In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import *
import xgboost as xgb
import operator

import settings
import utils
import get_data
from ta import *


/home/bukosabino/envs/deeplearning/local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Get Data

API: http://bitcoincharts.com/charts

period = ['1-min', '5-min', '15-min', '30-min', 'Hourly', '2-hour', '6-hour', '12-hour', 'Daily', 'Weekly']

market = ['krakenEUR', 'bitstampUSD'] -> list of markets: https://bitcoincharts.com/charts/volumepie/


In [2]:
# get_data.get('data/datas.csv', period=settings.PERIOD, market=settings.MARKET)

Load Data


In [3]:
# Load the market history. Later cells read the columns Timestamp, Open, High,
# Low, Close, Volume_BTC, Volume_Currency and Weighted_Price from this frame.
df = pd.read_csv('data/datas.csv', sep=',')

In [4]:
# Append a placeholder row for the next, not-yet-observed period so that the
# model can later be asked to predict it.

# Length in seconds of one bar for every supported settings.PERIOD value
# (Timestamp is expressed in seconds: one hour is 3600).
PERIOD_SECONDS = {
    '1-min': 60,
    '5-min': 5 * 60,
    '15-min': 15 * 60,
    '30-min': 30 * 60,
    'Hourly': 3600,
    '2-hour': 2 * 3600,
    '6-hour': 6 * 3600,
    '12-hour': 12 * 3600,
    'Daily': 24 * 3600,
    'Weekly': 7 * 24 * 3600,
}
if settings.PERIOD not in PERIOD_SECONDS:
    # Fail with a clear message instead of a NameError on next_timestamp.
    raise ValueError('Unsupported settings.PERIOD: {!r}'.format(settings.PERIOD))

last_timestamp = df['Timestamp'].iloc[-1]
next_timestamp = last_timestamp + PERIOD_SECONDS[settings.PERIOD]
df_next = pd.DataFrame([next_timestamp], columns=['Timestamp'])

# pd.concat replaces DataFrame.append, which newer pandas versions removed.
df = pd.concat([df, df_next], ignore_index=True)

# The new row only has a Timestamp. Fill its remaining columns with the dummy
# value 1 so that the row survives the NaN filtering done in later cells; the
# price/volume columns are overwritten afterwards by the shift(1) step.
df.iloc[-1] = df.iloc[-1].fillna(1)

In [5]:
# Report the size of the frame right after loading and appending the next row.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))


Number of rows: 51212, Number of columns: 8

Preprocessing


In [6]:
# Remove unusable rows with the project helper utils.dropna (implementation not
# shown here — presumably drops rows containing NaN values; TODO confirm).
df = utils.dropna(df)

In [7]:
# Report the size of the frame after preprocessing.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))


Number of rows: 45461, Number of columns: 8

Transformation

Create the `Target` column with one of three classes: KEEP (0), UP (1) or DOWN (2).


In [8]:
# Label every bar by how far Close moved away from Open:
# 0 = KEEP, 1 = UP, 2 = DOWN.
rise_threshold = df.Open + (df.Open * settings.PERCENT_UP)
fall_threshold = df.Open - (df.Open * settings.PERCENT_DOWN)

df['Target'] = 0
df.loc[df.Close > rise_threshold, 'Target'] = 1
# DOWN is assigned last, so it takes precedence if both conditions hold.
df.loc[df.Close < fall_threshold, 'Target'] = 2

In [9]:
# Report the frame size and how many rows fall in the UP (1) and DOWN (2) classes.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))

up_rows = int((df.Target == 1).sum())
down_rows = int((df.Target == 2).sum())
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(up_rows, down_rows))


Number of rows: 45461, Number of columns: 9
Number of UP rows: 3301, Number of DOWN rows: 3115

Derive the Date, Year, Month, Week, Weekday, Day and Hour columns from Timestamp.

Feature Engineering


In [10]:
# Convert the raw Timestamp into a datetime column, then expand it into
# calendar features (one column per datetime component).
df['Date'] = pd.to_datetime(df['Timestamp'].apply(utils.timestamptodate))

calendar_parts = [
    ('Year', 'year'),
    ('Month', 'month'),
    ('Week', 'weekofyear'),
    ('Weekday', 'weekday'),
    ('Day', 'day'),
    ('Hour', 'hour'),
]
for column_name, dt_attribute in calendar_parts:
    df[column_name] = getattr(df['Date'].dt, dt_attribute)

# extra dates
# df["yearmonth"] = df["Date"].dt.year*100 + df["Date"].dt.month
# df["yearweek"] = df["Date"].dt.year*100 + df["Date"].dt.weekofyear
# df["yearweekday"] = df["Date"].dt.year*10 + df["Date"].dt.weekday

In [11]:
# shift
# Move every market column down by one row so that each row only carries the
# values of the previous period; Target, Timestamp and the date columns stay
# in place. The first row has nothing to inherit and is dropped.
cols = ['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'Weighted_Price']
df[cols] = df[cols].shift(1)
df = df.dropna()

In [12]:
def _direction_labels(frame, up_margin, down_margin):
    """Return one 0/1/2 label per row of `frame`.

    1 where Open + up_margin < Close, 2 where Open - down_margin > Close
    (assigned last, so it wins if both hold), 0 otherwise.
    """
    labels = pd.Series(0, index=frame.index)
    labels.loc[frame.Open + up_margin < frame.Close] = 1
    labels.loc[frame.Open - down_margin > frame.Close] = 2
    return labels


# Simple price spreads.
df['High-low'] = df['High'] - df['Low']
df['Close-open'] = df['Close'] - df['Open']

# Direction of the move at the configured threshold.
df['Up_or_Down'] = _direction_labels(
    df, df.Open * settings.PERCENT_UP, df.Open * settings.PERCENT_DOWN)

# Direction of the move at twice the configured threshold.
df['Up_or_Down_2'] = _direction_labels(
    df, df.Open * settings.PERCENT_UP * 2, df.Open * settings.PERCENT_DOWN * 2)

# Direction of the move for any non-zero difference.
df['Up_or_Down_3'] = _direction_labels(df, 0, 0)

# Direction of the move at half the configured threshold.
df['Up_or_Down_4'] = _direction_labels(
    df, df.Open * settings.PERCENT_UP / 2, df.Open * settings.PERCENT_DOWN / 2)

In [13]:
# Fundamental analysis

# daily return
# Per-row return: relative change of Close versus the previous row
# (as a fraction and as a percentage).
close_ratio = df['Close'] / df['Close'].shift(1)
df['Daily_return'] = close_ratio - 1
df['Daily_return_100'] = (close_ratio - 1) * 100

# The first row has no previous Close, so drop it before anchoring the
# cumulative return on the first remaining Close.
df = df.dropna()
first_close = df['Close'].iloc[0]
df['Cumulative_return'] = (df['Close'] / first_close) - 1
df['Cumulative_return_100'] = ((df['Close'] / first_close) - 1) * 100

# TODO: cumulative return week, month, year...

In [14]:
# Report the size of the frame after the basic feature engineering.
row_count, column_count = df.shape
print('Number of rows: {}, Number of columns: {}'.format(row_count, column_count))


Number of rows: 45459, Number of columns: 26

Technical Analysis

Volume-based indicators


In [15]:
# Accumulation/Distribution index
df['Acc_Dist_Roc_BTC'] = acc_dist_roc(df, 'Volume_BTC', 2)
df['Acc_Dist_Roc_Currency'] = acc_dist_roc(df, 'Volume_Currency', 2)
df['Acc_Dist_BTC'] = acc_dist_index(df, 'Volume_BTC')
df['Acc_Dist_Currency'] = acc_dist_index(df, 'Volume_Currency')

# Chaikin Money Flow
df['Chaikin_Money_Flow_1_BTC'] = chaikin_money_flow1(df, 'Volume_BTC')
df['Chaikin_Money_Flow_2_BTC'] = chaikin_money_flow2(df, 'Volume_BTC', 20)
df['Chaikin_Money_Flow_3_BTC'] = chaikin_money_flow3(df, 'Volume_BTC', 20)
df['Chaikin_Money_Flow_1_Currency'] = chaikin_money_flow1(df, 'Volume_Currency')
df['Chaikin_Money_Flow_2_Currency'] = chaikin_money_flow2(df, 'Volume_Currency', 20)
df['Chaikin_Money_Flow_3_Currency'] = chaikin_money_flow3(df, 'Volume_Currency', 20)

# Money Flow Index
df['Money_Flow_BTC'] = money_flow_index(df, 'Volume_BTC', 14)
df['Money_Flow_Currency'] = money_flow_index(df, 'Volume_Currency', 14)

# On-balance volume
df['OBV_BTC'] = on_balance_volume(df, 'Volume_BTC')
df['OBV_BTC_mean'] = on_balance_volume_mean(df, 'Volume_BTC')
df['OBV_Currency'] = on_balance_volume(df, 'Volume_Currency')
df['OBV_Currency_mean'] = on_balance_volume_mean(df, 'Volume_Currency')

# Force Index
df['Force_Index_BTC'] = force(df, 'Volume_BTC', 2)
df['Force_Index_Currency'] = force(df, 'Volume_Currency', 2)

# delete intermediate columns
df.drop('OBV', axis=1, inplace=True)


ta/volume.py:38: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=3,min_periods=2,adjust=True).mean()
  return pd.Series(pd.ewma(ad, span=3, min_periods=2) - pd.ewma(ad, span=10, min_periods=9))
ta/volume.py:38: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=10,min_periods=9,adjust=True).mean()
  return pd.Series(pd.ewma(ad, span=3, min_periods=2) - pd.ewma(ad, span=10, min_periods=9))
ta/volume.py:48: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
  return pd.Series(pd.rolling_mean(mf, n))
ta/volume.py:57: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).sum()
  return pd.Series(moments.rolling_sum(clv*df[col_volume], n) / moments.rolling_sum(df[col_volume], n))
ta/volume.py:74: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  n_positive_mf = pd.rolling_sum(df['1_Period_Positive_Money_Flow'], n)
ta/volume.py:75: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  n_negative_mf = pd.rolling_sum(df['1_Period_Negative_Money_Flow'], n)
ta/volume.py:102: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
  return pd.Series(pd.rolling_mean(df['OBV'], n))

Trend indicators


In [16]:
# Moving Average Convergence Divergence
df[['MACD', 'MACD_sign', 'MACD_diff']] = macd(df, 12, 26, 9)

# Average directional movement index
df[['ADX', 'ADX_pos', 'ADX_neg']] = adx(df, 14)

# Vortex indicator
df[['Vortex_pos', 'Vortex_neg']] = vortex(df, 14)


ta/trend.py:27: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=12,min_periods=25,adjust=True).mean()
  EMAfast = pd.Series(pd.ewma(df['Close'], span=n_fast, min_periods=n_slow - 1))
ta/trend.py:28: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=26,min_periods=25,adjust=True).mean()
  EMAslow = pd.Series(pd.ewma(df['Close'], span=n_slow, min_periods=n_slow - 1))
ta/trend.py:30: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=9,min_periods=8,adjust=True).mean()
  MACDsign = pd.Series(pd.ewma(MACD, span=n_sign, min_periods=n_sign - 1), name='MACD_sign_%d_%d' % (n_fast, n_slow))
ta/trend.py:57: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  trs = pd.rolling_sum(tr, n)
ta/trend.py:68: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  dip = 100 * pd.rolling_sum(pos, n) / trs
ta/trend.py:69: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  din = 100 * pd.rolling_sum(neg, n) / trs
ta/trend.py:72: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,min_periods=0,adjust=True,com=14).mean()
  adx = pd.ewma(dx, n)
ta/trend.py:87: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  trn = moments.rolling_sum(tr, n)
ta/trend.py:92: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  vip = moments.rolling_sum(vmp, n) / trn
ta/trend.py:93: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).sum()
  vin = moments.rolling_sum(vmm, n) / trn

Momentum Indicators


In [17]:
# Relative Strength Index from the `ta` helpers; 14 is the helper's second
# positional argument (presumably the smoothing period — TODO confirm).
df['RSI'] = rsi(df, 14)


ta/momentum.py:16: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,min_periods=0,adjust=True,com=14).mean()
  emaup = pd.ewma(up, n)
ta/momentum.py:17: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,min_periods=0,adjust=True,com=14).mean()
  emadn = pd.ewma(dn, n)

In [18]:
"""
for c in df.columns:
    print str(c) + u' - ' + str(df[c].isnull().sum())
"""


Out[18]:
"\nfor c in df.columns:\n    print str(c) + u' - ' + str(df[c].isnull().sum())\n"

Price-based indicators


In [19]:
# Momentum
for idx in range(9):
    m = idx+2
    df['Momentum_'+str(m)] = ((df['Close'] / df['Close'].shift(m)) - 1)

# Rollings
for idx in range(9):
    m = idx+2
    df['Rolling_mean_'+str(m)] = (df.set_index('Date')['Close'].rolling(window=m).mean()).values
    df['Rolling_std_'+str(m)] = (df.set_index('Date')['Close'].rolling(window=m).std()).values
    df['Rolling_cov_'+str(m)] = (df.set_index('Date')['Close'].rolling(window=m).cov()).values

# Bollinger bands
for idx in range(9):
    m = idx+2
    df['Bollinger_band_mean_'+str(m)+'_max'] = df['Rolling_mean_'+str(m)] + (2*df['Rolling_std_'+str(m)])
    df['Bollinger_band_mean_'+str(m)+'_min'] = df['Rolling_mean_'+str(m)] - (2*df['Rolling_std_'+str(m)])

In [20]:
# Drop the rows that still contain NaN after the indicators were added and
# report the frame size before and after.
shape_report = 'Number of rows: {}, Number of columns: {}'
print(shape_report.format(*df.shape))
df = df.dropna()
print(shape_report.format(*df.shape))


Number of rows: 45459, Number of columns: 107
Number of rows: 45394, Number of columns: 107

Split


In [21]:
# Partition the feature frame with the project helper utils.split_df
# (implementation not shown). NOTE(review): the xgboost cell reports only the
# first row of `test`, so confirm which rows end up there.
train, test = utils.split_df(df)

In [22]:
# Feature columns: everything except the label and the two time columns.
excl = ['Target', 'Date', 'Timestamp']
cols = list(df.columns.drop(excl))

xgboost


In [ ]:
# Train a 3-class XGBoost model (0 = KEEP, 1 = UP, 2 = DOWN), report its
# prediction for the first test row and list the feature importances.
y_train = train['Target']

# Booster parameters. The number of trees is NOT set here: it is chosen by the
# early-stopped cross-validation below and passed as num_boost_round.
xgb_params = {
    'eta': 0.0045,
    'max_depth': 20,
    'subsample': 0.95,
    'colsample_bytree': 0.95,
    'colsample_bylevel': 0.95,
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss', # 'merror', # 'rmse',
    'base_score': 0,
    'silent': 1
}

dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])

# Cross-validate once, with early stopping, to find the number of rounds.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=5000,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )
# With early stopping the result holds one row per retained boosting round.
num_boost_rounds = len(cv_result)

# num_boost_rounds = 1000

print(num_boost_rounds)

# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# predict
y_pred = model.predict(dtest)
# Labels stored for the test rows. For the appended "next" row this is only a
# placeholder, so it must not be reported as the prediction.
y_true = test['Target']

# Report the model output (not the stored label) for the first test row.
prediction_value = y_pred[0]

if prediction_value == 1.0:
    print("Prediction: UP")
elif prediction_value == 2.0:
    print("Prediction: DOWN")
else: # 0.0
    print("Prediction: KEEP")

# Feature importance by split count, least used first.
print("\n \n \n \n \n \n ********** WEIGHT ************")
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
for i in importance:
    print(i)

# Feature importance by gain, lowest first.
print("\n \n \n \n \n \n ********** GAIN ************")
importance = model.get_score(fmap='', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1))
for i in importance:
    print(i)


[0]	train-mlogloss:1.09319	test-mlogloss:1.09409
[50]	train-mlogloss:0.865379	test-mlogloss:0.907757
[100]	train-mlogloss:0.699335	test-mlogloss:0.777195
[150]	train-mlogloss:0.57352	test-mlogloss:0.68265
[200]	train-mlogloss:0.47552	test-mlogloss:0.612951
[250]	train-mlogloss:0.397623	test-mlogloss:0.561003
[300]	train-mlogloss:0.334694	test-mlogloss:0.522111
[350]	train-mlogloss:0.283236	test-mlogloss:0.492986
[400]	train-mlogloss:0.240706	test-mlogloss:0.471339
[450]	train-mlogloss:0.20543	test-mlogloss:0.455488
[500]	train-mlogloss:0.176122	test-mlogloss:0.44411
[550]	train-mlogloss:0.151703	test-mlogloss:0.436222
[600]	train-mlogloss:0.131343	test-mlogloss:0.431155
[650]	train-mlogloss:0.11435	test-mlogloss:0.428371
[700]	train-mlogloss:0.10007	test-mlogloss:0.427287
[750]	train-mlogloss:0.0881423	test-mlogloss:0.427607
714
Prediction: KEEP

 
 
 
 
 
 ********** WEIGHT ************
('Cumulative_return_100', 27)
('Cumulative_return', 390)
('Up_or_Down_3', 1087)
('Rolling_mean_2', 1236)
('Rolling_mean_9', 1365)
('Rolling_mean_8', 1404)
('Rolling_mean_10', 1497)
('Rolling_mean_7', 1518)
('Rolling_mean_6', 1564)
('Weighted_Price', 1776)
('Rolling_mean_5', 1790)
('Bollinger_band_mean_8_max', 1803)
('Rolling_mean_4', 1926)
('Bollinger_band_mean_7_max', 1928)
('Rolling_mean_3', 2021)
('Bollinger_band_mean_9_max', 2102)
('Bollinger_band_mean_6_max', 2259)
('Bollinger_band_mean_5_max', 2384)
('Bollinger_band_mean_7_min', 2430)
('Rolling_cov_9', 2481)
('Up_or_Down_2', 2482)
('Bollinger_band_mean_8_min', 2532)
('Rolling_cov_7', 2584)
('Bollinger_band_mean_6_min', 2618)
('Bollinger_band_mean_10_max', 2631)
('Up_or_Down', 2656)
('Bollinger_band_mean_9_min', 2711)
('Rolling_cov_8', 2752)
('Bollinger_band_mean_4_max', 2929)
('Rolling_cov_6', 3001)
('Rolling_cov_10', 3189)
('Bollinger_band_mean_5_min', 3238)
('Bollinger_band_mean_3_max', 3329)
('Bollinger_band_mean_10_min', 3380)
('Bollinger_band_mean_4_min', 3646)
('Bollinger_band_mean_3_min', 4040)
('Bollinger_band_mean_2_max', 4308)
('Up_or_Down_4', 4339)
('Bollinger_band_mean_2_min', 4511)
('Open', 4651)
('Rolling_cov_5', 4746)
('OBV_Currency', 4990)
('Rolling_cov_4', 5304)
('Rolling_cov_3', 6211)
('Low', 6276)
('Daily_return_100', 8621)
('OBV_Currency_mean', 8763)
('Year', 10316)
('High', 10402)
('Rolling_cov_2', 10487)
('Acc_Dist_Roc_Currency', 14673)
('Chaikin_Money_Flow_3_Currency', 17320)
('Rolling_std_9', 23939)
('Rolling_std_8', 24036)
('Rolling_std_7', 24640)
('Month', 27944)
('Rolling_std_10', 29021)
('Rolling_std_6', 29164)
('Rolling_std_5', 32027)
('Weekday', 37152)
('Money_Flow_Currency', 37467)
('Force_Index_Currency', 39773)
('Rolling_std_4', 39775)
('Week', 40557)
('Chaikin_Money_Flow_1_Currency', 41247)
('Rolling_std_3', 42046)
('Acc_Dist_Currency', 43287)
('Chaikin_Money_Flow_2_Currency', 44701)
('Rolling_std_2', 46386)
('Close', 47521)
('OBV_BTC', 47541)
('Volume_Currency', 50124)
('High-low', 57275)
('Close-open', 60216)
('MACD', 61742)
('MACD_sign', 62046)
('RSI', 63136)
('Chaikin_Money_Flow_1_BTC', 63638)
('Chaikin_Money_Flow_3_BTC', 65609)
('Day', 66716)
('Vortex_pos', 67091)
('Vortex_neg', 68111)
('Hour', 69583)
('Force_Index_BTC', 69640)
('Momentum_8', 69921)
('MACD_diff', 70665)
('Momentum_9', 71557)
('Chaikin_Money_Flow_2_BTC', 71923)
('Momentum_6', 72125)
('Acc_Dist_BTC', 72244)
('Money_Flow_BTC', 72843)
('Momentum_7', 73485)
('Momentum_10', 73496)
('Momentum_5', 75876)
('Momentum_2', 78745)
('ADX_pos', 79034)
('ADX_neg', 80115)
('Momentum_3', 81444)
('Momentum_4', 82088)
('OBV_BTC_mean', 83296)
('Volume_BTC', 84223)
('Daily_return', 84516)
('ADX', 99026)
('Acc_Dist_Roc_BTC', 101354)

 
 
 
 
 
 ********** GAIN ************

In [ ]: