The idea behind this project is to build a model for predicting CNX Nifty movements. Algorithms used will try to predict the direction of tomorrow’s exchange closing price, whether it is going to be lower or higher with respect to today's price. Decision process involves CNX Nifty daily returns over the period of time.
At the end I have written a simple trading strategy: if the prediction for the day is “up”, the strategy purchases 500 shares of CNX Nifty and sells them at the end of the day. I start with 1M INR and trade only with this amount of money. Finally, I compare the result against a benchmark.
Features:
CNX Nifty 50
NASDAQ Composite
Frankfurt DAX
London FTSE-100
Tokyo Nikkei-225
Hong Kong Hang Seng
Australia ASX-200
Shanghai Composite Index
I have used yahoo as a data source. Downloaded data-frame contains the following columns:
Date : in days
Open : price of the stock at the opening of the trading
High : highest price of the stock during the trading day
Low : lowest price of the stock during the trading day
Close : price of the stock at the closing of the trading
Volume : amount of stocks traded
Adj Close : price of the stock at the closing of the trading
adjusted with dividends
I am mainly interested in the Adj Close.
In [1]:
%pylab inline
import Quandl as qd
import pandas as pd
import pandas.io.data
import seaborn
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
In [2]:
def fetch_data_from_yahoo(symbol, start, end):
    """Download the price history of one ticker from Yahoo.

    Thin wrapper around ``pandas.io.data.get_data_yahoo``: ``symbol``,
    ``start`` and ``end`` are passed through unchanged and the frame it
    produces is returned as-is.
    """
    return pandas.io.data.get_data_yahoo(symbol, start, end)
In [3]:
def fetch_and_save_data(symbols, names, start, end):
    """Download every symbol and save each one as a CSV file under ``./Data``.

    ``symbols`` and ``names`` are paired by position with ``zip``, so surplus
    entries in the longer sequence are ignored.  Each frame is written to
    ``./Data/<name>_<start>_<end>.csv``; ``start`` and ``end`` therefore have
    to be strings, because they are concatenated into the file name.

    Returns the downloaded frames as a list, in the order of ``symbols``.
    """
    import os

    output_dir = './Data'
    # to_csv does not create missing directories, so make sure the target
    # exists instead of failing on the first save.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    frames = []
    for symbol, name in zip(symbols, names):
        frame = fetch_data_from_yahoo(symbol, start, end)
        file_name = name + '_' + start + '_' + end + '.csv'
        frame.to_csv(os.path.join(output_dir, file_name), mode='w')
        frames.append(frame)
    return frames
In [4]:
def daily_return(dataset):
    """Add a 'Daily Return' column holding the row-to-row change of 'Adj Close'.

    The column is the result of ``pct_change()`` on 'Adj Close'.  The frame is
    modified in place and also returned.
    """
    adjusted_close = dataset['Adj Close']
    dataset['Daily Return'] = adjusted_close.pct_change()
    return dataset
In [5]:
def plot_data(data_frames, names, subplot=False, market_names=None):
    """Plot the columns listed in ``names`` for every frame in ``data_frames``.

    Parameters:
        data_frames: sequence of DataFrames to draw, one after another.
        names: column labels to plot from each frame.
        subplot: when True, one large figure is created and every frame gets
            its own cell in a two-column grid.
        market_names: accepted for backward compatibility; not used.

    Returns nothing; the charts are drawn through ``plt``.
    """
    import math

    # subplot() needs an integer row count; math.ceil returns a float on
    # Python 2, so convert explicitly.
    rows = int(math.ceil(len(data_frames) / 2.0))
    if subplot:
        plt.figure(1, figsize=(80, 30))

    columns = list(names)
    for position, data_frame in enumerate(data_frames, 1):
        if subplot:
            plt.subplot(rows, 2, position)
        if subplot or len(columns) == 1:
            # Draw on the current axes.  In grid mode this is the cell that
            # was just selected; previously a multi-column plot opened a new
            # figure here and left the grid cell empty.
            data_frame[columns].plot(ax=plt.gca())
        else:
            # Outside grid mode a multi-column plot keeps getting a figure of
            # its own, as before.
            data_frame[columns].plot()
        plt.title("========= Data ====== ")
        plt.legend(loc='best')
In [6]:
def get_moving_average(data_frame, intervals):
    """Add a simple moving average of 'Adj Close' for every window size given.

    For each ``interval`` in ``intervals`` a column named ``MA_<interval>`` is
    written into ``data_frame``.  The frame is modified in place and also
    returned.
    """
    prices = data_frame['Adj Close']
    for interval in intervals:
        if hasattr(prices, 'rolling'):
            # Current pandas: the module-level pd.rolling_mean no longer exists.
            averaged = prices.rolling(interval).mean()
        else:
            # Legacy pandas releases that predate Series.rolling.
            averaged = pd.rolling_mean(prices, interval)
        data_frame['MA_' + str(interval)] = averaged
    return data_frame
In [7]:
def get_ewma(data_frame, intervals):
    """Add an exponentially weighted moving average of 'Adj Close' per span.

    For each ``interval`` in ``intervals`` a column named ``EMA_<interval>``
    is written into ``data_frame``, using ``interval`` as the ``span``.  The
    frame is modified in place and also returned.
    """
    prices = data_frame['Adj Close']
    for interval in intervals:
        if hasattr(prices, 'ewm'):
            # Current pandas: the module-level pd.ewma no longer exists.
            smoothed = prices.ewm(span=interval).mean()
        else:
            # Legacy pandas releases that predate Series.ewm.
            smoothed = pd.ewma(prices, span=interval)
        data_frame['EMA_' + str(interval)] = smoothed
    return data_frame
In [8]:
def concat(a, b):
    """Return ``a + b`` (string concatenation when both arguments are strings)."""
    joined = a + b
    return joined
In [9]:
def renamme_columns(data_frames, market_names):
    """Append ``_<market_name>`` to every column label of the matching frame.

    ``market_names`` and ``data_frames`` are paired by position with ``zip``.
    Each frame is renamed in place; nothing is returned.
    """
    for market_name, data_frame in zip(market_names, data_frames):
        suffix = '_' + market_name
        # The lambda is consumed immediately by rename(), so capturing the
        # per-iteration suffix through a default argument is purely defensive.
        data_frame.rename(columns=lambda label, suffix=suffix: label + suffix,
                          inplace=True)
In [10]:
def merge_data_frames(datasets, index):
    """Concatenate the frames side by side, dropping their first ``index`` columns.

    From every frame in ``datasets`` the columns from position ``index``
    onwards are kept; the pieces are joined column-wise (``axis=1``), which
    aligns rows on the frames' indexes.
    """
    # Iterate over the argument (the original read the global ``data_frames``
    # and silently ignored ``datasets``) and slice by position with .iloc,
    # since .ix has been removed from pandas.
    return pd.concat([frame.iloc[:, index:] for frame in datasets], axis=1)
In [11]:
def count_missing(dataframe):
    """Return how many cells of ``dataframe`` are missing.

    Computed as the total number of cells minus the number of cells that
    ``DataFrame.count()`` reports as present.
    """
    n_rows, n_cols = dataframe.shape
    present = dataframe.count().sum()
    return n_rows * n_cols - present
Add an extra column to the data frame for the direction of the daily return. If the return is 0 or greater make it +1, else -1.
Split data into training and test data.
In [65]:
def prepare_data_for_classification(dataset, start_test):
    """Label every row as up/down and split the data into train and test parts.

    An 'UpDown' column is added to ``dataset`` (the frame passed in is
    modified): rows whose 'Daily Return_nse' is negative get -1, all other
    rows, including rows with a missing return, get +1.  The labels are then
    passed through a ``LabelEncoder``.

    The features are all columns except the first and the last one.  Rows
    whose index is smaller than ``start_test`` form the training set, the
    remaining rows form the test set.

    Returns ``X_train, y_train, X_test, y_test``.

    NOTE(review): unless 'Daily Return_nse' is the first column it stays in
    the feature set, so the label is a direct function of one of the
    features - confirm that this is intended.
    """
    returns = dataset['Daily Return_nse']
    # Build the labels in one vectorised step.  The original wrote them
    # through chained indexing (dataset.UpDown[mask] = ...), which pandas
    # does not guarantee to propagate back to the frame.  NaN < 0 is False,
    # so missing returns end up as +1 exactly as the old fillna(1) did.
    direction = (returns < 0).map({True: -1.0, False: 1.0})

    encoder = preprocessing.LabelEncoder()
    dataset['UpDown'] = encoder.fit_transform(direction)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown

    in_train = X.index < start_test
    X_train, X_test = X[in_train], X[~in_train]
    y_train, y_test = y[in_train], y[~in_train]
    return X_train, y_train, X_test, y_test
==================================================================================================================== Machine Learning Related Functions
In [13]:
def apply_svc(X_train,y_train, X_test, y_test, kernel='linear', C=1):
    """Fit an ``SVC`` on the training split and print its score on the test split.

    ``kernel`` and ``C`` are handed to ``SVC`` unchanged.  Nothing is returned.
    """
    model = SVC(kernel=kernel, C=C).fit(X_train, y_train)
    print("Accuracy for SVM Classifier %s" % model.score(X_test, y_test))
In [14]:
def apply_knn(X_train,y_train, X_test, y_test):
    """Fit a default ``KNeighborsClassifier`` and print its score on the test split.

    Nothing is returned.
    """
    model = neighbors.KNeighborsClassifier().fit(X_train, y_train)
    print("Accuracy for KNN Classifier %s" % model.score(X_test, y_test))
In [15]:
def apply_random_forest(X_train,y_train, X_test, y_test):
    """Fit a five-tree random forest and print its score on the test split.

    The forest is built with ``n_estimators=5`` and ``n_jobs=-1``; no
    ``random_state`` is passed, so repeated calls presumably print different
    scores.  Nothing is returned.
    """
    forest = RandomForestClassifier(n_estimators=5, n_jobs=-1).fit(X_train, y_train)
    print("Accuracy for RF Classifier %s" % forest.score(X_test, y_test))
In [16]:
def select_best_param_svc(X_train,y_train,parameters):
    """Grid-search an ``SVC`` over ``parameters`` and print the best combination.

    ``parameters`` is passed to ``GridSearchCV`` as its parameter grid.
    Nothing is returned.
    """
    search = GridSearchCV(SVC(), parameters)
    search.fit(X_train, y_train)
    print ("Best Parameter SVC", search.best_params_)
In [17]:
def get_svc_prediction(X_train,y_train, x_predict, kernel='linear', C=1):
    """Fit an ``SVC`` on the training data and return its predictions for ``x_predict``.

    ``kernel`` and ``C`` are handed to ``SVC`` unchanged.
    """
    model = SVC(kernel=kernel, C=C)
    return model.fit(X_train, y_train).predict(x_predict)
In [ ]:
** Portfolio Analysis
In [18]:
class MarketIntradayPortfolio():
    """Intraday back-test that trades a fixed number of shares on signal days.

    For every row the position is ``shares * signal`` and the day's profit is
    that position multiplied by ``Adj Close - Open`` of the same row.

    Parameters:
        symbol: label used as the column name of the positions frame.
        bars: price frame providing the 'Open' and 'Adj Close' columns.
        signals: frame providing a 'signal' column; its index becomes the
            index of the positions and of the back-test result.
        initial_capital: starting cash, converted to ``float``.
        shares: number of shares per signal unit, converted to ``int``.
    """

    def __init__(self, symbol, bars, signals, initial_capital=1000000.0, shares=500):
        self.symbol = symbol
        self.bars = bars
        self.signals = signals
        self.initial_capital = float(initial_capital)
        self.shares = int(shares)
        # Positions are derived once, at construction time.
        self.positions = self.generate_positions()

    def generate_positions(self):
        """Return a frame with one column, named after the symbol: shares * signal."""
        positions = pd.DataFrame(index=self.signals.index)
        positions[self.symbol] = self.shares * self.signals['signal']
        return positions

    def backtest_portfolio(self):
        """Return a frame with per-row 'price_diff', 'profit', 'total' and 'returns'.

        'price_diff' is Adj Close minus Open, 'profit' is the position times
        'price_diff', 'total' is the initial capital plus the cumulative
        profit, and 'returns' is the row-to-row percentage change of 'total'.
        """
        portfolio = pd.DataFrame(index=self.positions.index)
        portfolio['price_diff'] = self.bars['Adj Close'] - self.bars['Open']
        portfolio['profit'] = self.positions[self.symbol] * portfolio['price_diff']
        portfolio['total'] = self.initial_capital + portfolio['profit'].cumsum()
        portfolio['returns'] = portfolio['total'].pct_change()
        return portfolio
In [ ]:
===================================================================================================================
In [66]:
# Date window of the data set; both values are passed to fetch_and_save_data,
# which also embeds them in the CSV file names.
start_date='2014-01-01'
end_date='2015-10-01'
In [67]:
# Ticker symbols to download and the short name used for each market; the two
# lists are paired by position.  The short names become CSV file-name prefixes
# and column suffixes (e.g. 'Daily Return_nse').
# NOTE(review): this list contains '^DJI' and no DAX ticker, while the feature
# list in the introduction names Frankfurt DAX and does not mention the Dow -
# confirm which set of markets is intended.
indices=['^NSEI','^DJI','^FTSE','^AXJO','^HSI','^N225','^IXIC','000001.SS']
market_name=['nse','dji','ftse','aus','hsi', 'nikkei','nasdaq','sanghai']
In [68]:
data_frames=fetch_and_save_data(indices, market_name, start_date, end_date)
In [69]:
for data_frame in data_frames:
data_frame['Adj Close'].plot()
In [70]:
for data_frame in data_frames:
daily_return(data_frame)
In [71]:
plot_data(data_frames, ['Daily Return'])
In [72]:
data_frames[0]['Daily Return'].plot()
Out[72]:
In [73]:
data_frames[0].index
Out[73]:
In [74]:
for data_frame in data_frames:
data_frame = get_moving_average(data_frame, [5,10,15,20])
In [75]:
cols=['Adj Close','MA_5','MA_10','MA_15','MA_20']
for col in cols:
data_frames[0][col].plot(legend=True)
plt.title('CNX Nifty ADj Close & MA')
In [76]:
for data_frame in data_frames:
data_frame = get_ewma(data_frame, [5,10,15,20])
In [77]:
cols=['Adj Close','EMA_5','EMA_10','EMA_15','EMA_20']
for col in cols:
data_frames[0][col].plot(legend=True)
plt.title('CNX Nifty ADj Close & EMA')
In [78]:
cols=['Adj Close','EMA_5','EMA_10','EMA_15','EMA_20']
for col in cols:
data_frames[1][col].plot(legend=True)
plt.title('DJI ADj Close & EMA')
In [79]:
for data_frame in data_frames:
data_frame=get_ewma(data_frame, [5,10,15,20])
plot_data(data_frames, ['Adj Close','EMA_5','EMA_10','EMA_15','EMA_20'])
In [80]:
renamme_columns(data_frames, market_name)
In [81]:
data_frames[1].index.name
Out[81]:
In [82]:
print(data_frames[1].columns)
In [83]:
for name, data_frame in zip(market_name, data_frames):
print ("No of Data for [%8s] are [%s]"%(name,len(data_frame)))
Merge Data Frames
We are interested in the columns from Adjusted Close onwards.
In [84]:
merged_data = merge_data_frames(data_frames, 5)
In [85]:
print(merged_data.columns)
In [39]:
merged_data.describe()
Out[39]:
In [86]:
# Next-row NSE return: shift(-1) moves every value of 'Daily Return_nse' up by one row.
# NOTE(review): the result is assigned through attribute access instead of
# merged_data['Return_CNX_NIFTY']; pandas does not create a new column this
# way, so this most likely only sets a plain attribute on the object and leaves
# the frame's columns unchanged - confirm whether a real column was intended.
merged_data.Return_CNX_NIFTY = merged_data['Daily Return_nse'].shift(-1)
In [87]:
merged_data.Return_CNX_NIFTY.plot()
Out[87]:
In [88]:
merged_data['Adj Close_nse'].plot()
Out[88]:
The plot is broken because of missing data.
In [ ]:
Count missing data(NaN) after simple merge
In [89]:
print("Shape of merged data",merged_data.shape,".")
print("After merge out of [",len(merged_data)*len(merged_data.columns),"] [",count_missing(merged_data),"] data points are missing.")
In [90]:
# First pass at filling the gaps: interpolate with method='time', then report
# how many cells are still missing.
# NOTE(review): presumably relies on merged_data having a date index - confirm.
merged_data = merged_data.interpolate(method='time')
print ('Number of NaN after time interpolation: %s' % str(count_missing(merged_data)))
In [91]:
# Second pass: whatever is still missing is replaced by the mean of its column.
# NOTE(review): the means are computed over the whole frame, i.e. including the
# dates that are later used as the test period - confirm this is acceptable.
merged_data = merged_data.fillna(merged_data.mean())
print ('Number of NaN after mean interpolation: %s' %count_missing(merged_data))
In [92]:
merged_data['Adj Close_nse'].plot()
Out[92]:
In [93]:
merged_data['Daily Return_nse'].plot()
Out[93]:
In [94]:
# Use the rows from 2014-01-01 to 2015-08-01; rows before 2015-06-01 become the
# training set and the rest of the slice the test set.
# NOTE(review): prepare_data_for_classification adds an 'UpDown' column to the
# frame it receives, which here is a date slice of merged_data - confirm that
# writing into the slice is intended.
X_train, y_train, X_test, y_test = prepare_data_for_classification(merged_data['2014-01-01':'2015-08-01'], '2015-06-01')
In [95]:
print ("======== Shapes ======== ")
print ("Training X",X_train.shape)
print ("Training y",y_train.shape)
print ("Test X",X_test.shape)
print ("Test y",y_test.shape)
print ("======================== ")
In [96]:
plt.figure()
y_test.plot(kind='bar', alpha=0.5)
plt.axhline(0, color='k')
Out[96]:
In [97]:
plt.figure()
y_train.plot(kind='bar', alpha=0.9)
plt.axhline(0, color='r')
Out[97]:
In [98]:
print ("Postive and negative movement in train data outcome.")
print (y_train.value_counts())
print ("Postive and negative movement in test data outcome.")
print (y_test.value_counts())
Data points are evenly distributed.
===================================================================================================================
Machine Learning classifier
In [99]:
apply_svc(X_train, y_train, X_test, y_test)
In [100]:
apply_knn(X_train, y_train, X_test, y_test)
In [101]:
apply_random_forest(X_train, y_train, X_test, y_test)
In [102]:
apply_random_forest(X_train, y_train, X_test, y_test)
In [103]:
parameters = {'kernel':( 'linear', 'rbf'), 'C':[1, 10]}
select_best_param_svc(X_train, y_train, parameters)
In [104]:
apply_svc(X_train, y_train, X_test, y_test, kernel='rbf', C=1)
In [105]:
apply_svc(X_train, y_train, X_test, y_test, kernel='linear', C=1)
In [106]:
# Date window of the price bars downloaded for the back-test below.
# These use '/' as the separator, unlike start_date/end_date above.
b_start_date='2015/07/30'
b_end_date='2015/10/01'
In [107]:
# Label used for the positions column and in the chart texts.
symbol = 'CNX-NIFTY'
# Fresh download of the NSE index for the back-test window.
bars = fetch_data_from_yahoo('^NSEI', b_start_date, b_end_date)
# Re-split the full merged data: train on rows before 2015-08-01 and predict
# the rows from 2015-08-01 onwards with a linear SVC (the function's defaults).
X_train, y_train, X_test, y_test = prepare_data_for_classification(merged_data, '2015-08-01')
predict=get_svc_prediction(X_train, y_train, X_test)
In [108]:
# One signal per price bar, indexed like `bars`.
signals = pd.DataFrame(index=bars.index)
# This initial value is overwritten by the next statement.
signals['signal'] = 0.0
# NOTE(review): `predict` was produced for the rows of merged_data from
# 2015-08-01 onwards, while `bars` starts at 2015/07/30 and comes from a
# separate download.  Presumably `predict` is a plain array, in which case it
# is matched to the bars by position and must have exactly len(bars) entries -
# confirm that the two line up date for date.
signals['signal'] = predict
In [109]:
# Row-to-row change of the signal.  MarketIntradayPortfolio itself only reads
# signals['signal'], so this column does not influence the back-test.
signals['positions'] = signals['signal'].diff()
# Defaults of the class apply: 1,000,000 initial capital and 500 shares.
portfolio = MarketIntradayPortfolio(symbol, bars, signals)
returns = portfolio.backtest_portfolio()
# Show the last five rows of the back-test result.
print (returns.tail(5))
In [110]:
# Two stacked charts sharing the x axis: index close price on top, portfolio
# value below.
f, ax = plt.subplots(2, sharex=True)
f.patch.set_facecolor('white')
ylabel = symbol + ' Close Price in Rs'
# Top chart: 'Close' column of the downloaded bars.
bars['Close'].plot(ax=ax[0], color='r', lw=3.)
ax[0].set_ylabel(ylabel, fontsize=10)
ax[0].set_xlabel('', fontsize=14)
ax[0].legend(('Close Price CNX-NIFTY',), loc='upper left', prop={"size":14})
# The date range in the title is hard-coded and does not follow b_start_date/b_end_date.
ax[0].set_title('CNX-NIFTY Close Price VS Portofolio Performance (31 July 2015 - 01 Oct 2015)', fontsize=16, fontweight="bold")
# Bottom chart: running portfolio value ('total' column of the back-test result).
returns['total'].plot(ax=ax[1], color='b', lw=3.)
ax[1].set_ylabel('Portfolio value in Rs', fontsize=10)
ax[1].set_xlabel('Date', fontsize=14)
ax[1].legend(('Portofolio Performance.',), loc='upper left', prop={"size":18})
plt.tick_params(axis='both', which='major', labelsize=10)
# Raise the tick limit for daily ticks on the bottom axis.
# NOTE(review): DAILY is not defined in this file - presumably it is provided
# by the %pylab namespace - and this assumes the axis' major locator exposes a
# `maxticks` mapping; confirm both.
loc = ax[1].xaxis.get_major_locator()
loc.maxticks[DAILY] = 24
plt.show()
Performance
Above is the CNX Nifty Close Price between 31 July 2015 and 01 October 2015. The first graph shows the actual trend of the market index for that period. In this particular period the market had a negative return (-4.76%).
Portfolio return for the same period: the second graph shows the trend of the portfolio generated on top of our predictions. The start value is 1M INR, and after the roughly two months of trading shown above the portfolio ends with a return of about 26%.
In [ ]: