In [3]:
%matplotlib inline
from __future__ import print_function
import pandas
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.tsa.stattools as smtsa
import matplotlib.pyplot as pyplot
import thinkplot
import thinkstats2
import regression
FORMATS = ['png']
In [4]:
transactions = pandas.read_csv('mj-clean.csv', parse_dates=[5])
In [5]:
transactions.columns
Out[5]:
In [6]:
transactions.head()
Out[6]:
In [7]:
transactions.shape
Out[7]:
In [8]:
def GroupByQualityAndDay(transactions):
groups = transactions.groupby('quality')
dailies = {}
for name, group in groups:
dailies[name] = GroupByDay(group)
return dailies
def GroupByDay(transactions, func=np.mean):
grouped = transactions[['date', 'ppg']].groupby('date')
daily = grouped.aggregate(func)
daily['date'] = daily.index
start = daily.date[0]
one_year = np.timedelta64(1, 'Y')
daily['years'] = (daily.date - start) / one_year
return daily
In [9]:
dailies = GroupByQualityAndDay(transactions)
dailies
Out[9]:
In [8]:
thinkplot.PrePlot(rows=3)
for i, (name, daily) in enumerate(dailies.items()):
thinkplot.SubPlot(i+1)
title = 'price per gram ($)' if i == 0 else ''
thinkplot.Config(ylim=[0,20], title=title)
thinkplot.Scatter(daily.index, daily.ppg, s=10, label=name)
if i == 2:
pyplot.xticks(rotation=30)
else:
thinkplot.Config(xticks=[])
In [9]:
def RunLinearModel(daily):
model = smf.ols('ppg ~ years', data=daily)
results = model.fit()
return model, results
In [11]:
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
thinkplot.SubPlot(i)
model, results = RunLinearModel(daily)
PlotFittedValues(model, results, label=name)
i += 1
In [10]:
def PlotFittedValues(model, results, label=''):
years = model.exog[:,1]
values = model.endog
thinkplot.Scatter(years, values, s=15, label=label)
thinkplot.Plot(years, results.fittedvalues, label='model')
In [12]:
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
thinkplot.SubPlot(i)
dates = pandas.date_range(daily.index.min(), daily.index.max())
reindexed = daily.reindex(dates)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
reindexed.ppg.fillna(fake_data, inplace=True)
thinkplot.Scatter(reindexed.index, reindexed.ppg, s=15, label=name)
thinkplot.Plot(ewma.index, ewma)
i += 1
In [13]:
def SerialCorr(series, lag=1):
xs = series[lag:]
ys = series.shift(lag)[lag:]
corr = thinkstats2.Corr(xs, ys)
return corr
In [14]:
lag_corr = {}
nlags = 365
thinkplot.PrePlot(rows = 3)
i = 1
for name, daily in dailies.items():
dates = pandas.date_range(daily.index.min(), daily.index.max())
reindexed = daily.reindex(dates)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
reindexed.ppg.fillna(fake_data, inplace=True)
rolling_mean = pandas.rolling_mean(reindexed.ppg, 30)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
lag_corr[name] = []
acf = smtsa.acf(res, nlags=nlags, unbiased=True)
thinkplot.SubPlot(i)
SimulateAutocorrelation(daily, nlags=nlags)
thinkplot.Plot(acf, label=name)
i += 1
thinkplot.Show(axis=[0,nlags,-.1,.1])
In [15]:
def SimulateAutocorrelation(daily, iters=1001, nlags=40):
"""Resample residuals, compute autocorrelation, and plot percentiles.
daily: DataFrame
iters: number of simulations to run
nlags: maximum lags to compute autocorrelation
"""
# run simulations
t = []
for _ in range(iters):
filled = FillMissing(daily, span=30)
resid = thinkstats2.Resample(filled.resid)
acf = smtsa.acf(resid, nlags=nlags, unbiased=True)[1:]
t.append(np.abs(acf))
high = thinkstats2.PercentileRows(t, [97.5])[0]
low = -high
lags = range(1, nlags+1)
thinkplot.FillBetween(lags, low, high, alpha=0.2, color='gray')
In [16]:
def FillMissing(daily, span=30):
"""Fills missing values with an exponentially weighted moving average.
Resulting DataFrame has new columns 'ewma' and 'resid'.
daily: DataFrame of daily prices
span: window size (sort of) passed to ewma
returns: new DataFrame of daily prices
"""
dates = pandas.date_range(daily.index.min(), daily.index.max())
reindexed = daily.reindex(dates)
ewma = pandas.ewma(reindexed.ppg, span=span)
resid = (reindexed.ppg - ewma).dropna()
fake_data = ewma + thinkstats2.Resample(resid, len(reindexed))
reindexed.ppg.fillna(fake_data, inplace=True)
reindexed['ewma'] = ewma
reindexed['resid'] = reindexed.ppg - ewma
return reindexed
In [17]:
def AddWeeklySeasonality(daily, inc=2):
frisat = (daily.index.dayofweek == 4) | (daily.index.dayofweek == 5)
fake = daily.copy()
fake.ppg[frisat] += np.random.uniform(0, inc, frisat.sum())
return fake
In [83]:
lag_corr = {}
nlags = 40
thinkplot.PrePlot(rows = 3)
i = 1
for name, daily in dailies.items():
fdaily = AddWeeklySeasonality(daily, 2)
dates = pandas.date_range(fdaily.index.min(), fdaily.index.max())
reindexed = fdaily.reindex(dates)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
reindexed.ppg.fillna(fake_data, inplace=True)
rolling_mean = pandas.rolling_mean(reindexed.ppg, 30)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
lag_corr[name] = []
acf = smtsa.acf(res, nlags=nlags, unbiased=True)
thinkplot.SubPlot(i)
SimulateAutocorrelation(fdaily, nlags=nlags)
thinkplot.Plot(acf, label=name)
i += 1
thinkplot.Show(axis=[0,nlags,-.1,.1])
In [18]:
def GenerateSimplePrediction(results, years):
"""Generates a simple prediction.
results: results object
years: sequence of times (in years) to make predictions for
returns: sequence of predicted values
"""
n = len(years)
inter = np.ones(n)
d = dict(Intercept=inter, years=years)
predict_df = pandas.DataFrame(d)
predict = results.predict(predict_df)
return predict
In [19]:
def RunQuadraticModel(daily):
daily["years2"] = daily.years**2
model = smf.ols('ppg ~ years + years2', data=daily)
results = model.fit()
return model, results
In [38]:
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
thinkplot.SubPlot(i)
model, results = RunPolyModel(daily,2)
print(results.summary())
PlotFittedValues(model, results, label=name)
i += 1
In [29]:
def RunPolyModel(daily, pow=1):
formula = 'ppg ~ years'
if pow > 1:
for i in xrange(2,pow + 1):
name = "years" + str(i)
daily[name] = daily.years**i
formula += ' + ' + name
model = smf.ols(formula, data=daily)
results = model.fit()
return model, results
In [39]:
def GenerateSimplePrediction(results, years):
"""Generates a simple prediction.
results: results object
years: sequence of times (in years) to make predictions for
returns: sequence of predicted values
"""
n = len(years)
inter = np.ones(n)
d = dict(Intercept=inter, years=years, years2=years**2)
print(d)
# predict_df = pandas.DataFrame(d)
# predict = results.predict(predict_df)
# print(predict)
# return predict
In [41]:
GenerateSimplePrediction(results, [1])
In [58]:
def GeneratePredictions(result_seq, years, add_resid=False):
"""Generates an array of predicted values from a list of model results.
When add_resid is False, predictions represent sampling error only.
When add_resid is True, they also include residual error (which is
more relevant to prediction).
result_seq: list of model results
years: sequence of times (in years) to make predictions for
add_resid: boolean, whether to add in resampled residuals
returns: sequence of predictions
"""
n = len(years)
d = dict(Intercept=np.ones(n), years=years, years2=years**2)
predict_df = pandas.DataFrame(d)
predict_seq = []
for fake_results in result_seq:
predict = fake_results.predict(predict_df)
if add_resid:
predict += thinkstats2.Resample(fake_results.resid, n)
predict_seq.append(predict)
return predict_seq
In [48]:
def SimulateResults(daily, iters=101, func=RunLinearModel):
"""Run simulations based on resampling residuals.
daily: DataFrame of daily prices
iters: number of simulations
func: function that fits a model to the data
returns: list of result objects
"""
_, results = func(daily)
fake = daily.copy()
result_seq = []
for _ in range(iters):
fake.ppg = results.fittedvalues + thinkstats2.Resample(results.resid)
_, fake_results = func(fake)
result_seq.append(fake_results)
return result_seq
In [49]:
def PlotPredictions(daily, years, iters=101, percent=90, func=RunLinearModel):
"""Plots predictions.
daily: DataFrame of daily prices
years: sequence of times (in years) to make predictions for
iters: number of simulations
percent: what percentile range to show
func: function that fits a model to the data
"""
result_seq = SimulateResults(daily, iters=iters, func=func)
p = (100 - percent) / 2
percents = p, 100-p
predict_seq = GeneratePredictions(result_seq, years, add_resid=True)
low, high = thinkstats2.PercentileRows(predict_seq, percents)
thinkplot.FillBetween(years, low, high, alpha=0.3, color='gray')
predict_seq = GeneratePredictions(result_seq, years, add_resid=False)
low, high = thinkstats2.PercentileRows(predict_seq, percents)
thinkplot.FillBetween(years, low, high, alpha=0.5, color='gray')
In [59]:
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
thinkplot.SubPlot(i)
start = daily.years.min()
PlotPredictions(daily=daily, years=np.arange(start, 5, 0.01), func=RunQuadraticModel)
thinkplot.Scatter(daily.years, daily.ppg)
i += 1
In [84]:
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
thinkplot.SubPlot(i)
dates = pandas.date_range(daily.index.min(), daily.index.max())
reindexed = daily.reindex(dates)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
reindexed.ppg.fillna(fake_data, inplace=True)
reindexed.date = reindexed.index
start = daily.date[0]
one_year = np.timedelta64(1, 'Y')
reindexed.years = (reindexed.date - start) / one_year
start = reindexed.years.min()
PlotPredictions(daily=reindexed, years=np.arange(start, 5, 0.01), func=RunQuadraticModel)
thinkplot.Scatter(reindexed.years, reindexed.ppg)
i += 1
In [131]:
class SerialCorrelationTest(thinkstats2.HypothesisTest):
"""Tests serial correlations by permutation."""
def __init__(self, data, lag):
self.data = ReindexData(data)
self.lag = lag
self.actual = self.TestStatistic(self.data)
def TestStatistic(self, data):
"""Computes the test statistic.
data: tuple of xs and ys
"""
test_stat = abs(thinkstats2.SerialCorr(data.ppg, self.lag))
return test_stat
def RunModel(self):
"""Run the model of the null hypothesis.
returns: simulated data
"""
test_data = self.data.copy()
test_ppg = thinkstats2.Resample(self.data.ppg)
test_data.ppg = test_ppg
return test_data
In [14]:
def ReindexData(daily):
dates = pandas.date_range(daily.index.min(), daily.index.max())
reindexed = daily.reindex(dates)
ewma = pandas.ewma(reindexed.ppg, span=30)
res = (reindexed.ppg - ewma).dropna()
fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
reindexed.ppg.fillna(fake_data, inplace=True)
return reindexed
In [134]:
for name, daily in dailies.items():
sct = SerialCorrelationTest(daily, 1)
print(name, '{0:.5f}'.format(sct.PValue(iters=5000)))
In [135]:
class SerialCorrelationTest2(thinkstats2.HypothesisTest):
"""Tests serial correlations by permutation."""
def TestStatistic(self, data):
"""Computes the test statistic.
data: tuple of xs and ys
"""
series, lag = data
test_stat = abs(thinkstats2.SerialCorr(series, lag))
return test_stat
def RunModel(self):
"""Run the model of the null hypothesis.
returns: simulated data
"""
series, lag = self.data
permutation = series.reindex(np.random.permutation(series.index))
return permutation, lag
In [118]:
for name, daily in dailies.items():
sct = SerialCorrelationTest2((daily.ppg, 1))
print(name, '{0:.3f}'.format(sct.PValue(iters=5000)))
In [12]:
def EWMAPredict(daily, date):
results = ReindexData(daily.copy())
date_ranges = pandas.date_range(results.index.max(), date)
for day in date_ranges:
ewma = pandas.ewma(results.ppg, span=30)
inter = ewma[-1]
ewma_slope = pandas.ewma(results.ppg.diff(), span=30).dropna()
slope = ewma_slope[-1]
results.reindex(pandas.date_range(results.index.min(), day))
results.ppg[-1] = inter + slope
results = results[daily.index.max():]
return results
In [15]:
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
thinkplot.SubPlot(i)
results = EWMAPredict(daily, '2015-01-01')
thinkplot.Scatter(daily.index, daily.ppg)
thinkplot.Plot(results.index, results.ppg)
i += 1
In [10]:
for name, daily in dailies.items():
print(daily.index.max())
In [ ]: