In [1]:

    
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb
import operator

import settings
from ta import *
import utils

Load Data



In [3]:

    
# Location of the hourly CSV, relative to the notebook's working directory.
HOURLY_CSV_PATH = 'data/datas-Hourly.csv'

# Read the comma-separated file into the working DataFrame used by all later cells.
df = pd.read_csv(HOURLY_CSV_PATH, sep=',')



In [4]:

    
# Report the size of the frame right after loading.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))









    



Number of rows: 58725, Number of columns: 8

Preprocessing



In [5]:

    
# Clean the loaded frame with the project helper `utils.dropna` and rebind `df` to the result.
# NOTE(review): the helper's exact filtering rules live in utils.py, not in this notebook —
# confirm what it removes before relying on it.
df = utils.dropna(df)



In [6]:

    
# Report the size of the frame after the cleaning step.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))









    



Number of rows: 52974, Number of columns: 8

Transformation



In [7]:

    
# Pair every row with the Close of the row that follows it (shift(-1)); that
# next-row close is stored as Close_target. dropna() then discards every row
# that contains a NaN, which includes the last row (it has no successor).
df = df.assign(Close_target=df['Close'].shift(-1)).dropna()

Create the `Target` column with three classes: KEEP (0), UP (1) and DOWN (2).



In [8]:

    
# Price levels the next close has to cross to count as a move; the relative
# margins come from the project settings module.
upper_bound = df['Close'] + (df['Close'] * settings.PERCENT_UP)
lower_bound = df['Close'] - (df['Close'] * settings.PERCENT_DOWN)

# Boolean masks: next close strictly above / strictly below the margins.
closes_higher = upper_bound < df['Close_target']
closes_lower = lower_bound > df['Close_target']

# Every row starts as KEEP; UP is applied first and DOWN last, so DOWN wins
# if a row were ever to satisfy both conditions.
df["Target"] = 0 # 'KEEP'
df.loc[closes_higher, "Target"] = 1 # 'UP'
df.loc[closes_lower, "Target"] = 2 # 'DOWN'



In [9]:

    
# Report the frame size and how many rows carry the UP (1) and DOWN (2) labels.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))

up_rows = df[df['Target'] == 1]
down_rows = df[df['Target'] == 2]
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(up_rows), len(down_rows)))









    



Number of rows: 52973, Number of columns: 10
Number of UP rows: 4867, Number of DOWN rows: 4488

Derive the Date, Year, Month, Week, Weekday, Day and Hour columns from Timestamp.

Feature Engineering



In [10]:

    
# Turn the raw Timestamp column into pandas datetimes. The project helper
# utils.timestamptodate is applied element-wise and its result is parsed by
# pd.to_datetime.
# NOTE(review): the unit/timezone of Timestamp is decided inside the helper — confirm in utils.py.
df['Date'] = pd.to_datetime(df['Timestamp'].apply(utils.timestamptodate))

# Calendar components used as model features.
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month

# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in pandas 2.0.
# Prefer isocalendar().week (same ISO-8601 week number) and cast its nullable
# UInt32 result to a plain int64 so the column keeps an ordinary integer dtype;
# fall back to weekofyear only on pandas versions that predate isocalendar().
if hasattr(date_parts, 'isocalendar'):
    df['Week'] = date_parts.isocalendar().week.astype('int64')
else:
    df['Week'] = date_parts.weekofyear

df['Weekday'] = date_parts.weekday
df['Day'] = date_parts.day
df['Hour'] = date_parts.hour



In [11]:

    
# Report the frame size after the date-derived columns were added.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))









    



Number of rows: 52973, Number of columns: 17

Technical Analysis

https://en.wikipedia.org/wiki/Technical_analysis

https://github.com/bukosabino/ta



In [12]:

    
# Column names handed positionally to add_all_ta_features, in the order the
# call expects them.
price_volume_columns = ("Open", "High", "Low", "Close", "Volume_BTC")

# Append the technical-analysis indicator columns and rebind `df` to the result.
# NOTE(review): how fillna=True fills missing indicator values is decided by
# the `ta` package — confirm against its documentation.
df = add_all_ta_features(df, *price_volume_columns, fillna=True)



In [13]:

    
# Print the frame size before and after dropping rows that contain NaN values,
# so any rows lost at this step are visible in the cell output.
shape_message = 'Number of rows: {}, Number of columns: {}'

print(shape_message.format(*df.shape))
df = df.dropna()
print(shape_message.format(*df.shape))









    



Number of rows: 52973, Number of columns: 65
Number of rows: 52973, Number of columns: 65

Split



In [14]:

    
# Partition the feature frame into a train part and a test part using the project helper.
# NOTE(review): how split_df2 picks the boundary (chronological vs. random, split ratio)
# is not visible in this notebook — confirm in utils.py.
train, test = utils.split_df2(df)



In [15]:

    
# Columns that must not be fed to the model: the two target columns and the
# raw/parsed time columns.
excl = ['Close_target', 'Target', 'Date', 'Timestamp']

# Every remaining column, in the frame's own column order, is a feature.
cols = list(filter(lambda column_name: column_name not in excl, df.columns))

xgboost



In [16]:

    
# Class labels of the training partition (0 = KEEP, 1 = UP, 2 = DOWN).
y_train = train['Target']
# Mean of the integer class codes; retained from the original cell, not used below.
y_mean = np.mean(y_train)

# Booster parameters. The number of boosting rounds is not set here: it is
# passed to xgb.train() through num_boost_round. The former 'n_trees' entry was
# removed because it is not an XGBoost parameter and contradicted the round
# count actually used below.
xgb_params = {
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 1,
    'colsample_bytree': 0.95,
    'colsample_bylevel': 0.95,
    'objective': 'multi:softmax',
    'num_class' : 3,
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    # NOTE(review): newer XGBoost releases replace `silent` with `verbosity` —
    # confirm the installed version before switching.
    'silent': 1
}

# Feature matrices: labelled for training, unlabelled for prediction.
dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

# Quick cross-validation with xgb.cv defaults; cv_result is not used further in this cell.
cv_result = xgb.cv(xgb_params, dtrain)

# xgboost, cross-validation
# The full search below is disabled; it derives the round count from early stopping:
#
# cv_result = xgb.cv(xgb_params,
#                    dtrain,
#                    num_boost_round=5000,
#                    early_stopping_rounds=50,
#                    verbose_eval=50,
#                    show_stdv=False
#                   )
# num_boost_rounds = len(cv_result)
#
# NOTE(review): 705 is presumably the value such a run produced — confirm before reuse.
num_boost_rounds = 705

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# predict
y_pred = model.predict(dtest)
y_true = test['Target']

# Project helper that reports the evaluation metrics for the predictions.
utils.metrics(y_true, y_pred)

# Print each feature-importance measure, features sorted ascending by score.
# 'weight' is the measure get_fscore() reports, so one loop covers all three.
for banner, importance_type in (('WEIGHT', 'weight'), ('GAIN', 'gain'), ('COVER', 'cover')):
    print("\n \n \n \n \n \n ********** {} ************".format(banner))
    importance = model.get_score(fmap='', importance_type=importance_type)
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    for i in importance:
        print(i)









    



705
Accuracy: 0.820522500755059
Coefficient Kappa: 0.17967900398020387
Classification Report:
             precision    recall  f1-score   support

       KEEP       0.85      0.97      0.91     10891
         UP       0.34      0.14      0.20      1198
       DOWN       0.35      0.08      0.13      1155

avg / total       0.76      0.82      0.78     13244

Confussion Matrix:
[[10609   192    90]
 [  953   164    81]
 [  939   122    94]]

 
 
 
 
 
 ********** WEIGHT ************
('volatility11', 119)
('volatility10', 238)
('others2', 798)
('volatility5', 1392)
('volatility6', 1596)
('trend9', 1790)
('volatility15', 3119)
('volatility12', 3776)
('volatility14', 3944)
('trend4', 3955)
('Weighted_Price', 4485)
('volatility13', 6040)
('volatility8', 6496)
('volatility9', 7402)
('volatility4', 7767)
('volatility7', 8267)
('Year', 8818)
('trend5', 9663)
('Close', 11246)
('Low', 16553)
('trend21', 18256)
('trend20', 19700)
('High', 22685)
('volatility2', 24426)
('volatility3', 26839)
('Month', 30740)
('Weekday', 42243)
('Week', 42491)
('trend1', 70733)
('Open', 72519)
('trend2', 74092)
('Day', 76947)
('momentum3', 77056)
('trend17', 79030)
('volume2', 85328)
('Volume_Currency', 85556)
('trend11', 86091)
('trend10', 86362)
('trend13', 88156)
('trend12', 89501)
('trend18', 90079)
('momentum1', 91908)
('trend3', 95531)
('Hour', 103286)
('trend15', 104287)
('trend8', 105909)
('volatility1', 106580)
('trend7', 107938)
('trend19', 108027)
('momentum2', 108027)
('volume7', 110940)
('volume3', 124293)
('volume8', 126557)
('trend6', 129330)
('trend14', 131858)
('Volume_BTC', 132663)
('trend16', 132890)
('volume5', 133373)
('volume6', 137591)
('others1', 142877)
('volume1', 145204)

 
 
 
 
 
 ********** GAIN ************
('volatility10', 0.5170872642436973)
('volatility11', 0.6923272478991596)
('Hour', 0.7776092010327927)
('Month', 0.7852729577103852)
('Day', 0.7876862054303283)
('volatility5', 0.7979466844181038)
('volume5', 0.8210902556455228)
('volatility14', 0.825407780302206)
('Weekday', 0.8293042292520628)
('trend6', 0.8393059854688465)
('Open', 0.8458727616243855)
('momentum2', 0.8474287701248672)
('volatility15', 0.8495073505598585)
('trend8', 0.8547808506905206)
('volume2', 0.8592118954569066)
('volume3', 0.8604327502105438)
('trend1', 0.8616572123865395)
('volume6', 0.8646017543505381)
('trend14', 0.8693222377657203)
('trend10', 0.8754798352735653)
('High', 0.877790817767375)
('trend12', 0.8781832139771398)
('volume7', 0.882191959332056)
('trend7', 0.8864446714433019)
('trend16', 0.8915465343589873)
('trend2', 0.8946374629211571)
('trend11', 0.9001826155240248)
('Week', 0.9012241996892983)
('volume1', 0.9047467269961832)
('trend3', 0.910549631778559)
('Volume_BTC', 0.9114029024833612)
('momentum3', 0.9207257142873336)
('trend15', 0.9439241988852526)
('Volume_Currency', 0.9530427238455116)
('trend9', 0.9550994335195528)
('volatility2', 0.9794609926783743)
('Low', 0.9856682138318975)
('volume8', 1.0178480700506438)
('volatility6', 1.0342069722380962)
('momentum1', 1.0656084457519945)
('trend19', 1.0729989191660079)
('trend20', 1.081665951275685)
('volatility4', 1.149141552223083)
('Close', 1.2135379808866615)
('volatility8', 1.216294072809271)
('trend5', 1.2403155348239707)
('Weighted_Price', 1.242261133136233)
('volatility7', 1.2642245512622454)
('trend21', 1.3242578875575186)
('trend4', 1.3590979102683443)
('volatility12', 1.444297699334481)
('others2', 1.4933001948370925)
('trend18', 1.504699381417896)
('volatility3', 1.5825646511830562)
('volatility9', 1.7304209456336155)
('volatility1', 1.8404477313210088)
('others1', 1.948720251290734)
('trend17', 2.047671988596528)
('trend13', 2.424660373838925)
('Year', 3.2222409395096445)
('volatility13', 7.313006800213395)

 
 
 
 
 
 ********** COVER ************
('trend9', 25.804671592178796)
('Hour', 28.143664812946596)
('Weekday', 28.450029041497867)
('Day', 31.309943080561446)
('volatility5', 32.37619881465514)
('Open', 36.18827653042627)
('Month', 43.83631440858792)
('trend10', 44.47150315613339)
('trend8', 47.96643680848708)
('volatility10', 48.76242525210083)
('volume7', 52.63751205390411)
('trend12', 52.852942747009635)
('momentum2', 54.918665353476264)
('trend16', 55.75632073301281)
('trend6', 56.03004242751138)
('volume6', 56.36601787820345)
('volume1', 56.46354193562165)
('volume3', 57.76640951059219)
('trend7', 60.77275456966106)
('trend11', 60.78656070378944)
('volume5', 61.59079100275124)
('trend1', 66.12234993397718)
('trend3', 67.58943740523996)
('volume2', 68.52123259176518)
('trend14', 71.26857713153744)
('volatility11', 75.49387336134457)
('momentum3', 76.00257799898687)
('Volume_Currency', 77.08950059914031)
('High', 78.36661411373156)
('trend15', 78.41498752462)
('volatility15', 81.02395817249112)
('Week', 87.49800016921324)
('momentum1', 91.23010201016247)
('volume8', 91.42220374953699)
('Volume_BTC', 95.93807241664827)
('trend2', 107.18907846609814)
('volatility2', 112.50569906697801)
('volatility14', 123.99511832657205)
('Low', 124.78914586056896)
('volatility4', 125.5098015037974)
('volatility6', 126.88722194862169)
('trend19', 152.25598072083815)
('volatility8', 155.60228989070154)
('trend20', 156.42042407918845)
('volatility7', 164.47142885811044)
('Close', 173.2054934963535)
('Weighted_Price', 181.79089540468203)
('others1', 194.52295166786772)
('trend18', 204.86761891428418)
('trend5', 212.98152218876058)
('trend21', 222.4159118498022)
('volatility12', 235.93438504502103)
('trend17', 243.49888483677063)
('trend13', 248.38765379225944)
('volatility3', 263.6186295931295)
('others2', 286.98657634085237)
('trend4', 291.0410887206063)
('volatility9', 322.6777623196429)
('volatility1', 360.169337692341)
('Year', 764.324576829213)
('volatility13', 1350.233484963579)