In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
# Read the Dow Jones index data file as CSV into a DataFrame. The path is
# relative, so it is resolved against the kernel's current working directory.
dataset = pd.read_csv('./dji/dow_jones_index.data')
In [2]:
# Preview the first rows. A bare last expression gets the notebook's rich
# DataFrame rendering instead of the plain-text dump produced by print().
dataset.head()
In [3]:
# Group the closing prices by stock: observations[stock] is the list of that
# stock's closes, in the order the rows appear in `dataset`.
observations = {}
# Walk the two columns directly instead of using iterrows(), which builds a
# throwaway Series for every row just to read two fields.
for stock, close in zip(dataset['stock'], dataset['close']):
    # The close column holds text containing a "$"; strip it before converting.
    close = float(close.replace("$", ""))
    # setdefault creates the list the first time a stock is seen.
    observations.setdefault(stock, []).append(close)
In [4]:
# Turn the per-stock price lists into an array with one row per stock, rows
# ordered by sorted stock name.
# NOTE(review): a rectangular 2-D array assumes every stock has the same
# number of observations - confirm against the data file.
stock_names = sorted(observations)
X = np.array([observations[stock] for stock in stock_names])
In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
# Features are the first 12 columns of X; the target is the column right
# after them (index 12). One training sample per row of X.
X_train, y_train = X[:, :12], X[:, 12]
regr_1 = LinearRegression()
# Kept as the cell's last expression so the fitted estimator is displayed.
regr_1.fit(X_train, y_train)
Out[5]:
In [6]:
# Slide the 12-column input window forward one column at a time and score
# regr_1 on predicting the column that immediately follows each window.
# Offset 0 is exactly the split regr_1 was fitted on (X[:, :12] -> X[:, 12]).
plot_vals = []
for offset in range(0, X.shape[1]-X_train.shape[1]):
    X_test = X[:, offset:12+offset]
    y_test = X[:, 12+offset]
    # Predict once and reuse the result for both metrics.
    y_pred = regr_1.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print("offset=", offset, "r2_score=", r2)
    print("offset=", offset, "MAE =", mae)
    # Each entry is (offset, r2, mae); the plotting cell reads this layout.
    plot_vals.append( (offset, r2, mae) )

# Summary across all offsets.
print()
r2_values = [x[1] for x in plot_vals]
mae_values = [x[2] for x in plot_vals]
print("r2_score: mean=", np.mean(r2_values), "variance=", np.var(r2_values))
print("mae_score: mean=", np.mean(mae_values), "variance=", np.var(mae_values))
In [7]:
# Plot both metrics against the test offset on twin y-axes:
# r2 in blue on the left axis, MAE in red on the right axis.
# plot_vals entries are (offset, r2, mae) tuples.
offsets = [x[0] for x in plot_vals]

fig, ax1 = plt.subplots()
ax1.plot(offsets, [x[1] for x in plot_vals], 'b-')
# Mark the first point (offset 0) with a dot.
ax1.plot(plot_vals[0][0], plot_vals[0][1], 'bo')
ax1.set_xlabel('test week')
ax1.set_ylabel('r2_score', color='b')
# Colour the tick labels through tick_params so the colour also applies to
# ticks created later (e.g. when set_ylim changes the tick set), instead of
# recolouring only the Text objects that exist right now.
ax1.tick_params(axis='y', labelcolor='b')
ax1.set_ylim([0.9, 1.1])

ax2 = ax1.twinx()
ax2.plot(offsets, [x[2] for x in plot_vals], 'r-')
ax2.plot(plot_vals[0][0], plot_vals[0][2], 'ro')
ax2.set_ylabel('mae score', color='r')
ax2.tick_params(axis='y', labelcolor='r')
ax2.set_ylim([0, 3.3])

# Small margin on either side of offsets 0..12.
plt.xlim([-.1, 12.1])
plt.show()
In [8]:
# Build a larger training set from shorter windows: every run of
# `training_len` consecutive columns taken from the first 12 columns of X is
# a feature block, and the column right after it is the target.
training_len = 5
# Window start columns. max(1, ...) keeps the offset-0 window even when
# training_len leaves no room to slide.
window_starts = range(max(1, 12-training_len))
# Collect all windows first and stack once, rather than re-allocating the
# arrays with vstack/concatenate on every loop iteration.
X_train_short = np.vstack(
    [X[:, offset:training_len+offset] for offset in window_starts]
)
y_train_short = np.concatenate(
    [X[:, training_len+offset] for offset in window_starts]
)
In [9]:
# Fit a second linear model on the stacked sliding-window training set
# (X_train_short / y_train_short).
regr_2 = LinearRegression()
# Left as the last expression so the cell displays the fitted estimator.
regr_2.fit(X_train_short, y_train_short)
Out[9]:
In [10]:
# Score regr_2 on every target column from index 12 onwards. The input for
# each target is the `training_len` columns immediately preceding it.
plot_vals = []
for offset in range(0, X.shape[1]-X_train.shape[1]):
    X_test = X[:, 12-training_len+offset:12+offset]
    y_test = X[:, 12+offset]
    # Predict once and reuse the result for both metrics.
    y_pred = regr_2.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print("offset=", offset, "r2_score=", r2)
    print("offset=", offset, "MAE =", mae)
    # Each entry is (offset, r2, mae); the plotting cell reads this layout.
    plot_vals.append( (offset, r2, mae) )

# Summary across all offsets.
print()
r2_values = [x[1] for x in plot_vals]
mae_values = [x[2] for x in plot_vals]
print("r2_score: mean=", np.mean(r2_values), "variance=", np.var(r2_values))
print("mae_score: mean=", np.mean(mae_values), "variance=", np.var(mae_values))
In [11]:
# Plot both metrics against the test offset on twin y-axes:
# r2 in blue on the left axis, MAE in red on the right axis.
# plot_vals entries are (offset, r2, mae) tuples.
offsets = [x[0] for x in plot_vals]

fig, ax1 = plt.subplots()
ax1.plot(offsets, [x[1] for x in plot_vals], 'b-')
# Mark the first point (offset 0) with a dot.
ax1.plot(plot_vals[0][0], plot_vals[0][1], 'bo')
ax1.set_xlabel('test week')
ax1.set_ylabel('r2_score', color='b')
# Colour the tick labels through tick_params so the colour also applies to
# ticks created later (e.g. when set_ylim changes the tick set), instead of
# recolouring only the Text objects that exist right now.
ax1.tick_params(axis='y', labelcolor='b')
ax1.set_ylim([0.95, 1.05])

ax2 = ax1.twinx()
ax2.plot(offsets, [x[2] for x in plot_vals], 'r-')
ax2.plot(plot_vals[0][0], plot_vals[0][2], 'ro')
ax2.set_ylabel('mae score', color='r')
ax2.tick_params(axis='y', labelcolor='r')
ax2.set_ylim([0, 2.2])

# Small margin on either side of offsets 0..12.
plt.xlim([-.1, 12.1])
plt.show()
In [16]:
def _sliding_training_set(data, window, n_train_cols=12):
    """Build a training set from every `window`-wide slice of the leading columns.

    Each slice data[:, s:s+window] is a feature block and data[:, s+window]
    is its target, for s in 0 .. n_train_cols-window-1 (always at least s=0).
    Returns (features, targets) with the blocks stacked row-wise.
    """
    starts = range(max(1, n_train_cols - window))
    # Collect first, stack once: avoids re-allocating the arrays per window.
    features = np.vstack([data[:, s:window + s] for s in starts])
    targets = np.concatenate([data[:, window + s] for s in starts])
    return features, targets


def _score_by_offset(model, data, window, n_offsets, first_target=12):
    """Score `model` on target columns first_target .. first_target+n_offsets-1.

    The input for each target is the `window` columns immediately before it.
    Returns a list of (offset, r2, mae) tuples.
    """
    scores = []
    for offset in range(n_offsets):
        target_col = first_target + offset
        y_true = data[:, target_col]
        # Predict once and reuse the result for both metrics.
        y_pred = model.predict(data[:, target_col - window:target_col])
        scores.append(
            (offset, r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred))
        )
    return scores


def _plot_scores(scores, title):
    """Draw r2 (blue, left axis) and MAE (red, right axis) against the offset."""
    offsets = [s[0] for s in scores]
    r2_values = [s[1] for s in scores]
    mae_values = [s[2] for s in scores]

    fig, ax1 = plt.subplots()
    ax1.plot(offsets, r2_values, 'b-')
    # Mark the first point (offset 0) with a dot.
    ax1.plot(offsets[0], r2_values[0], 'bo')
    ax1.set_xlabel('test week')
    ax1.set_ylabel('r2_score', color='b')
    # tick_params colours the labels of ticks created later as well, unlike
    # recolouring the currently existing tick Text objects.
    ax1.tick_params(axis='y', labelcolor='b')
    ax1.set_ylim([0.95, 1.05])

    ax2 = ax1.twinx()
    ax2.plot(offsets, mae_values, 'r-')
    ax2.plot(offsets[0], mae_values[0], 'ro')
    ax2.set_ylabel('mae score', color='r')
    ax2.tick_params(axis='y', labelcolor='r')
    # Keep at least the 0..2.2 range, but grow it if the MAE exceeds that.
    ax2.set_ylim([0, max([2.2, 1.1*np.max(mae_values)])])

    plt.xlim([-.1, 12.1])
    plt.title(title)
    plt.show()
    # One figure is created per loop iteration; release it explicitly.
    plt.close(fig)


# Repeat the sliding-window experiment for every window width from 1 to 12,
# keeping each fitted model in `models` keyed by its width.
# NOTE: the loop rebinds training_len, X_train_short, y_train_short and
# plot_vals, so earlier cells that read those names see the last iteration's
# values if they are re-run after this cell.
training_lens = range(1,13)
models = {}
for training_len in training_lens:
    X_train_short, y_train_short = _sliding_training_set(X, training_len)

    regr_x = LinearRegression()
    regr_x.fit(X_train_short, y_train_short)
    models[training_len] = regr_x

    plot_vals = _score_by_offset(
        regr_x, X, training_len, X.shape[1]-X_train.shape[1]
    )
    _plot_scores(plot_vals, "results with training_len={}".format(training_len))

    print("r2_score: mean=", np.mean([x[1] for x in plot_vals]), "variance=", np.var([x[1] for x in plot_vals]))
    print("mae_score: mean=", np.mean([x[2] for x in plot_vals]), "variance=", np.var([x[2] for x in plot_vals]))
In [ ]: