In [ ]:
%matplotlib inline
import pandas as pd

In [ ]:
import json

# Load the tick log: the file is read line by line and every line is parsed
# as one JSON document, each becoming a row of the frame.
# The context manager guarantees the handle is closed once the frame is built;
# the generator expression is fully consumed by pd.DataFrame inside the block.
# Blank lines (e.g. a trailing newline) are skipped so json.loads does not fail.
with open('apple_stock_snowday.json') as f:
    data = pd.DataFrame(json.loads(tick) for tick in f if tick.strip())
data

In [ ]:
# Only one column matters for this analysis: the most recent sale price.
prices = data['lastSalePrice']
prices.plot()

In [ ]:
# We don't care about the overall price level; we're trying to predict small fluctuations.
diffs = prices.diff()  # tick-to-tick change: each price minus the one before it
# Keep only rows where the price actually moved (other rows updated some other field).
# The first diff is always NaN (it has no predecessor) and NaN != 0 evaluates to True,
# so it has to be dropped explicitly; otherwise it is later labelled as a "down" move.
diffs = diffs[diffs != 0].dropna()
diffs.plot()
# Stationary time series!

In [ ]:
# Keep the signed magnitudes; they are used later to size the profit of each move.
original_diffs = diffs.copy()
# Reduce every move to its direction: True = price went up, False = price went down.
# Vectorized comparison; no per-element Python call is needed.
diffs = diffs > 0
diffs.hist()

In [ ]:
def training_generator(window_size=5, series=None):
    """Yield consecutive sliding windows of the move series as ``{offset: value}`` dicts.

    Each yielded dict maps the offsets ``0 .. window_size - 1`` to the values at
    positions ``start .. start + window_size - 1``, so a DataFrame built from the
    generator has one row per window and one column per offset.

    Parameters
    ----------
    window_size : int, default 5
        Number of consecutive values per window.
    series : pandas.Series, optional
        Series to slide over. Defaults to the notebook-level ``diffs``.

    Yields
    ------
    dict
        ``{0: v0, 1: v1, ...}`` for one window.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 (raised on first iteration, since
        this is a generator).

    Notes
    -----
    ``len(series) - window_size`` windows are produced, i.e. the window that
    ends on the very last value is not emitted.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    # Fall back to the notebook-level series when the caller does not pass one.
    values = diffs if series is None else series
    for start in range(len(values) - window_size):
        yield {offset: values.iloc[start + offset] for offset in range(window_size)}

In [ ]:
# Materialize every window as one row of a frame; the columns are the window offsets.
generator = training_generator()
df = pd.DataFrame(data=generator)
df

In [ ]:
# Split each window: the first four moves are the features, the fifth is the label.
feature_columns = [0, 1, 2, 3]
examples = df[feature_columns]
answers = df[4]

In [ ]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: the tree permutes features at random when searching for splits, so
# without random_state the fitted model (and every number derived from it) can
# differ between runs of the notebook.
clf = DecisionTreeClassifier(random_state=0)

In [ ]:
# Fit the tree on the windows, then predict the label of those same windows.
# NOTE(review): these are in-sample predictions — the model is scored on the data it was fit on.
predictions = clf.fit(examples, answers).predict(examples)
predictions

In [ ]:
# Element-wise hit/miss flag: True where the predicted direction equals the actual one.
successes = answers == predictions
successes

In [ ]:
# Fraction of correct calls — did we do better than random guessing?
# The mean of a boolean Series is the share of True values; unlike sum()/len()
# it is always a float, regardless of the interpreter's integer-division rules.
successes.mean()

In [ ]:
# How much can we make?
# A correct call earns the size of the move (by buying or selling as appropriate),
# regardless of whether the move was up or down; a wrong call loses it.
potential_profits = original_diffs.abs()
# Row k of `answers` is the move at position k + (number of feature columns):
# the feature columns cover the moves just before it. Slice positionally from
# there, and take exactly one move per prediction, so every move lines up with
# the prediction that was made for it.
first_predicted = examples.shape[1]
potential_profits = potential_profits.iloc[first_predicted:first_predicted + len(successes)]
# Renumber 0..n-1 so the boolean mask `successes` aligns by index.
potential_profits = potential_profits.reset_index(drop=True)

profits = potential_profits[successes].sum()
losses = potential_profits[~successes].sum()
num_shares = 1000
net = (profits - losses) * num_shares
round(net)