notebook.community

Edit and run



In [ ]:

    
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import quandl, math, datetime
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
from matplotlib import style
import warnings

%matplotlib inline

style.use('ggplot')

##Github Issues Resolved
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
warnings.filterwarnings(action="ignore", module="scipy", message="^This module")




style.use('fivethirtyeight')

### Quandl Dataset
df = quandl.get('WIKI/GOOGL')

#print(df.tail())

df = df[['Adj. Open','Adj. High','Adj. Low','Adj. Close','Adj. Volume']]
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close','HL_PCT','PCT_change','Adj. Volume']]

forecast_col = 'Adj. Close' #Label variable

df.fillna(-99999,inplace=True)

forecast_out = int(math.ceil(0.01*len(df)))
print(df.tail())
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
print(df.tail())

xs = np.array(df.drop(['label'],1),dtype=np.float64)
ys = np.array(df['label'],dtype=np.float64)
###

xs_lately = xs[-forecast_out:]
xs = xs[:-forecast_out]
ys = ys[:-forecast_out]


m, b = best_fit_slope_and_intercept(xs, ys)
# print(m)
# print(b)

regression_line = [(m*x) + b for x in xs]

predict_x = 8
predict_y = (m*predict_x) + b
r_squared = coefficient_of_determination(ys, regression_line)
print(r_squared)

plt.scatter(xs, ys)
plt.scatter(predict_x, predict_y, s=150, color='g') # s is Size
plt.plot(xs, regression_line)
plt.show()



In [ ]:

    
def create_dataset(hm, variance, step=2, correlation=False):
    """Generate a synthetic 1-D dataset for exercising the regression helpers.

    Args:
        hm: Number of points to generate.
        variance: Half-width of the integer noise. Each y value is offset by
            ``random.randrange(-variance, variance)``. ``0`` produces a
            noise-free series (``randrange`` itself rejects an empty range).
        step: Amount the baseline moves after each point when a trend is
            requested.
        correlation: ``'pos'`` for a rising baseline, ``'neg'`` for a falling
            one; any other value keeps the baseline flat at 1.

    Returns:
        Tuple ``(xs, ys)`` of float64 arrays of length ``hm``; ``xs`` is
        ``0, 1, ..., hm - 1``.

    Raises:
        ValueError: Propagated from ``random.randrange`` when ``variance`` is
            negative.
    """
    if correlation == 'pos':
        delta = step
    elif correlation == 'neg':
        delta = -step
    else:
        delta = 0

    val = 1
    ys = []
    for _ in range(hm):
        # randrange(0, 0) is an empty range and raises, so skip the draw
        # entirely when no noise is requested.
        noise = random.randrange(-variance, variance) if variance else 0
        ys.append(val + noise)
        val += delta

    xs = np.arange(len(ys), dtype=np.float64)
    return xs, np.array(ys, dtype=np.float64)

 def best_fit_slope_and_intercept(xs, ys):
    m = ( ((mean(xs) * mean(ys)) - mean(xs*ys)) /
        ((mean(xs)*mean(xs)) - mean(xs*xs)) )
    
    b  = mean(ys) - m*mean(xs)
    return m, b


def squared_error(ys_orig, ys_line):
    """Return the sum of squared differences between two sets of y values.

    Args:
        ys_orig: Observed y values (sequence or array).
        ys_line: Comparison y values, e.g. points on a fitted line; must be
            broadcast-compatible with ``ys_orig``.

    Returns:
        ``sum((ys_line - ys_orig) ** 2)`` taken along the first axis, which is
        a scalar for 1-D inputs.
    """
    # Convert first so that two plain lists can be subtracted element-wise.
    residuals = np.asarray(ys_line) - np.asarray(ys_orig)
    # axis=0 mirrors iterating over the first dimension with the builtin sum.
    return np.sum(residuals ** 2, axis=0)

def coefficient_of_determination(ys_orig, ys_line):
    """Return R^2 of ``ys_line`` as a fit to ``ys_orig``.

    Computed as ``1 - SE(line) / SE(mean)``, where ``SE`` is
    ``squared_error`` and ``SE(mean)`` compares ``ys_orig`` with its own mean.

    Args:
        ys_orig: Observed y values.
        ys_line: Fitted y values, one per observation.

    Returns:
        The coefficient of determination. When every value in ``ys_orig`` is
        identical, ``SE(mean)`` is zero and the result is not meaningful.
    """
    ys_orig = np.asarray(ys_orig, dtype=np.float64)
    ys_line = np.asarray(ys_line, dtype=np.float64)
    # The mean is computed once and broadcast, not re-evaluated per element.
    y_mean_line = np.full_like(ys_orig, ys_orig.mean())
    squared_error_regr = squared_error(ys_orig, ys_line)
    squared_error_y_mean = squared_error(ys_orig, y_mean_line)
    return 1 - (squared_error_regr / squared_error_y_mean)



In [ ]:

    
## Quandl Code
%matplotlib notebook



### 2D Linear Regression
m, b = best_fit_slope_and_intercept(xs.flatten(), ys.flatten())
regression_line = [(m*x) + b for x in xs]
regression_line = np.array(regression_line,dtype=np.float64)
# accuracy
r_squared = coefficient_of_determination(ys.flatten(), regression_line.flatten())
print(r_squared)
###

### Prediction
forecast_set = np.array([(m*x) + b for x in xs_lately],dtype=np.float64)
# print(forecast_set)
###

### Plotting time and forecasting
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day

for i in forecast_set.flatten():
	next_date = datetime.datetime.fromtimestamp(next_unix)
	next_unix += one_day
	df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()