Linear Regression - predicting stock price


In [1]:
import pandas as pd
import quandl as qdl

# Download the daily price table for the 'WIKI/GOOGL' dataset from Quandl.
# NOTE(review): this is an unauthenticated network call; presumably an API key
# (and a still-available dataset) is needed for repeated runs - confirm.
df = qdl.get('WIKI/GOOGL')

# Keep only the adjusted price/volume columns used to build features.
# `.copy()` makes this an independent frame so the column assignments below
# do not raise pandas' SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

# Derived features, both expressed in percent (the * 100.0 is only a rescale
# and could be dropped without changing a scaled linear model).
# HL_PCT: daily high-low range relative to the low.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
# PCT_Change: (open - close) relative to the close.
# NOTE(review): this is negative on days the price rose; the conventional
# daily change is (close - open) / open - confirm which one is intended.
df['PCT_Change'] = (df['Adj. Open'] - df['Adj. Close']) / df['Adj. Close'] * 100.0

# Final feature set. Copy again so later cells can add columns and fill
# values in place without operating on a slice of the wider frame.
df = df[['Adj. Close', 'HL_PCT', 'PCT_Change', 'Adj. Volume']].copy()

print(df.head())


            Adj. Close    HL_PCT  PCT_Change  Adj. Volume
Date                                                     
2004-08-19   50.322842  8.441017   -0.323915   44659000.0
2004-08-20   54.322689  8.537313   -6.739913   22834300.0
2004-08-23   54.869377  4.062357    1.243144   18256100.0
2004-08-24   52.597363  7.753210    6.074187   15247300.0
2004-08-25   53.164113  3.966115   -1.169811    9188600.0

In [2]:
import math
# Report how many rows the feature frame holds.
print(f'total data points: {len(df)}')

# Column whose future value the model is trained to predict.
forecast_col = 'Adj. Close'

# Replace missing values with a large sentinel so the rows are kept and the
# gaps behave as outliers instead of being dropped.
df.fillna(-99999, inplace=True)

# Forecast horizon in rows: 1% of the dataset length, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))

# Build the labels: each row's label is the forecast column's value
# `forecast_out` rows later. The last `forecast_out` rows have no future
# value to look up, so their label is NaN.
df['label'] = df[forecast_col].shift(periods=-forecast_out)

print(f'prediction is done for {forecast_out} days \n')

print(df.head())


total data points: 3241
prediction is done for 33 days 

            Adj. Close    HL_PCT  PCT_Change  Adj. Volume      label
Date                                                                
2004-08-19   50.322842  8.441017   -0.323915   44659000.0  68.752232
2004-08-20   54.322689  8.537313   -6.739913   22834300.0  69.639972
2004-08-23   54.869377  4.062357    1.243144   18256100.0  69.078238
2004-08-24   52.597363  7.753210    6.074187   15247300.0  67.839414
2004-08-25   53.164113  3.966115   -1.169811    9188600.0  68.912727

In [3]:
import numpy as np
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle

# Build the feature matrix from every column except the target.
# `columns=` replaces the positional axis argument (`df.drop(['label'], 1)`),
# which pandas deprecated in 1.3 and removed in 2.0.
X = np.array(df.drop(columns=['label']))

# Standardise each feature column.
# NOTE(review): the scaling statistics are computed over all rows, including
# the forecast rows and what later becomes the test split - confirm that this
# leakage is acceptable for the purpose of this notebook.
X = preprocessing.scale(X)

# The last `forecast_out` rows have features but no label: keep them aside as
# the inputs to forecast from, and train/test on everything before them.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Drop the unlabeled tail rows from the frame so `y` lines up with `X`.
# This mutates `df` in place, so re-running this cell without re-running the
# earlier cells will make X and y disagree in length.
df.dropna(inplace=True)
y = np.array(df['label'])

print('training data input size='+str(len(X)),
      'and label size='+str(len(y)),
      ' should be equal')

# Split into train and test sets. A fixed seed makes the split, and therefore
# the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Linear regression model; n_jobs=-1 lets scikit-learn use all processors.
clf = LinearRegression(n_jobs=-1)
# Alternative model: clf = svm.SVR(kernel='linear')

# Fit the model on the training split.
clf.fit(X_train, y_train)

# Persist the trained model, then load it back to show the round trip.
with open('classifier.pickle', 'wb') as f:
    pickle.dump(clf, f)

# Use a context manager so the read handle is closed (it was left open before).
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code from an untrusted file.
with open('classifier.pickle', 'rb') as pickle_in:
    clf = pickle.load(pickle_in)

# Evaluate on the held-out split. For a regressor, score() returns the
# coefficient of determination (R^2), not a squared error, despite the wording
# of the message printed below.
accuracy = clf.score(X_test, y_test)

print('accuracy "Squared Error" of our trained model is '
      +str(accuracy) + '\n\n')

# Predict the target for the most recent, unlabeled rows.
predictions = clf.predict(X_lately)

print(predictions)


training data input size=3208 and label size=3208  should be equal
accuracy "Squared Error" of our trained model is 0.975044751473


[  957.41017385   966.9376067    972.03658538   981.403719     987.68498996
   995.38238111  1007.89759463  1010.70735821  1014.01866556  1003.31753096
  1005.4120939   1013.46031171  1021.0543115   1013.59150768  1019.28487591
  1021.81517399   981.05637138   975.12507375   986.71011056   983.69247443
   975.45360788   975.22074405   992.23221919   985.87773011   995.77491613
   994.35751371  1003.0166906    987.50348873   963.47595913   975.23162405
   952.6057195    945.62678091   934.90750979]

In [4]:
import datetime
import matplotlib.pyplot as plt
from matplotlib import style

style.use('ggplot')

# Column that will hold the forecast values; NaN for all historical rows.
df['Predictions'] = np.nan

# Last date currently present in the frame.
# NOTE(review): the previous cell's dropna() removed the unlabeled tail rows,
# so this is presumably `forecast_out` rows before the last downloaded date
# and the forecast may be drawn that many rows too early - confirm.
last_date = df.index[-1]

# One forecast date per prediction, on consecutive calendar days after
# `last_date`. Pandas date arithmetic is used instead of
# datetime.fromtimestamp(), which converts to the machine's local time zone
# and can shift every forecast date by the UTC offset.
# NOTE(review): calendar days include weekends/holidays, unlike the trading-day
# history - confirm whether business days are wanted here.
forecast_dates = pd.date_range(
    start=last_date + pd.Timedelta(days=1),
    periods=len(predictions),
    freq='D',
    name=df.index.name,
)

# Extend the frame once with the new dates (all columns NaN), then fill the
# forecast column in a single vectorised assignment instead of growing the
# frame row by row.
df = df.reindex(df.index.append(forecast_dates))
df.loc[forecast_dates, 'Predictions'] = predictions

# Plot the historical closing price and the forecast on the same axes.
fig, ax = plt.subplots()
df['Adj. Close'].plot(ax=ax)
df['Predictions'].plot(ax=ax)
ax.legend(loc=4)
ax.set_xlabel('Date')
ax.set_ylabel('Price')
plt.show()