In [21]:
%matplotlib inline
import timeseries, regression
import thinkstats2, thinkplot
import pandas
import matplotlib.pyplot as pyplot
import numpy as np
import statsmodels.formula.api as smf
A time series is a sequence of measurements taken from a system that varies in time.
In [23]:
transactions = pandas.read_csv('mj-clean.csv', parse_dates=[5])
dailies = timeseries.GroupByQualityAndDay(transactions)
In [24]:
def PlotDailies(dailies):
    thinkplot.PrePlot(rows=3)
    for i, (name, daily) in enumerate(dailies.items()):
        thinkplot.SubPlot(i+1)
        title = 'price per gram ($)' if i == 0 else ''
        thinkplot.Config(ylim=[0, 20], title=title)
        thinkplot.Scatter(daily.ppg, s=10, label=name)
        if i == 2:
            pyplot.xticks(rotation=30)
        else:
            thinkplot.Config(xticks=[])
    thinkplot.Show()
PlotDailies(dailies)
In [91]:
def RunLinearModel(daily):
    model = smf.ols('ppg ~ years', data=daily)
    results = model.fit()
    return model, results

for name, daily in dailies.items():
    model, results = RunLinearModel(daily)
    print
    print name
    regression.SummarizeResults(results)
In [27]:
def PlotFittedValues(model, results, label=''):
    years = model.exog[:, 1]
    values = model.endog
    thinkplot.Scatter(years, values, s=15, label=label)
    thinkplot.Plot(years, results.fittedvalues, label='model')
In [29]:
high = dailies['high']
model, results = RunLinearModel(high)
PlotFittedValues(model, results, "high")
thinkplot.Show()
Even though this is a good linear fit, we shouldn't trust it, because:
there is no reason to expect a long-term trend to be a line or any other simple function.
the linear regression model gives equal weight to all data; we should probably give more weight to recent data.
one of the assumptions of linear regression is that the residuals are uncorrelated noise, which is probably false here because successive values are likely correlated (see the quick check below).
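A minimal sketch of that check (not part of the original notebook): compute the lag-1 serial correlation of the residuals; a value far from zero suggests they are not uncorrelated noise.
# sketch: lag-1 serial correlation of the regression residuals
# (assumes `results` is the fitted result for the 'high' series above)
resid = results.resid
print thinkstats2.Corr(resid[1:], resid.shift(1)[1:])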
A common modeling assumption in time series analysis is that the observed series is the sum of a long-term trend, seasonal variation, and noise (random variation around the trend).
A moving average divides the series into overlapping regions, called windows, and computes the average of the values in each window.
The simplest moving average is the rolling mean, which computes the mean of the values in each window.
In [30]:
series = np.arange(10)
pandas.rolling_mean(series, 3)
Out[30]:
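Note that pandas.rolling_mean was removed in later versions of pandas; an equivalent call with the newer rolling-window API (a sketch) is:
# newer pandas equivalent of rolling_mean(series, 3)
pandas.Series(np.arange(10)).rolling(3).mean()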
In [34]:
def PlotRollingMean(daily):
    # since there are missing dates, we have to reindex the df
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    roll_mean = pandas.rolling_mean(reindexed.ppg, 30)
    thinkplot.Plot(roll_mean.index, roll_mean)
    pyplot.xticks(rotation=30)
PlotRollingMean(high)
exponentially-weighted moving average (EWMA) - the most recent value has the highest weight and the weights of previous values drop off exponentially.
In [35]:
def PlotEWMA(daily):
    # since there are missing dates, we have to reindex the df
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    ewma = pandas.ewma(reindexed.ppg, span=30)
    thinkplot.Plot(ewma.index, ewma)
    pyplot.xticks(rotation=30)
PlotEWMA(high)
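In newer pandas, pandas.ewma is replaced by the ewm accessor; a sketch of the same plot with the newer API (PlotEWMA2 is just an illustrative name):
# sketch: PlotEWMA rewritten with the newer pandas ewm API
def PlotEWMA2(daily):
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    ewma = reindexed.ppg.ewm(span=30).mean()  # replaces pandas.ewma(..., span=30)
    thinkplot.Plot(ewma.index, ewma)
    pyplot.xticks(rotation=30)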
In [71]:
def FillMissing(daily):
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    ewma = pandas.ewma(reindexed.ppg, span=30)
    # drawback: filling with the EWMA understates the noise in the series,
    # so this function simulates noise by resampling the residuals
    resid = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(resid, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)
    return reindexed
high2 = FillMissing(high)
In [65]:
PlotEWMA(high2)
serial correlation - each value is correlated with the next one in the series. To compute it, we shift the series by an interval called a lag, then compute the correlation of the shifted series with the original.
In [75]:
# returns a correlation between -1 and 1
def SerialCorr(series, lag=1):
    xs = series[lag:]
    ys = series.shift(lag)[lag:]
    corr = thinkstats2.Corr(xs, ys)
    return corr
ewma = pandas.ewma(high2.ppg, span=30)
resid = high2.ppg - ewma
corr = SerialCorr(resid, 365)
corr
Out[75]:
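To see how the serial correlation falls off with lag, we can reuse SerialCorr at a few other lags (a sketch):
# sketch: serial correlation of the EWMA residuals at several lags
for lag in [1, 7, 30, 365]:
    print lag, SerialCorr(resid, lag)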
autocorrelation function - maps from lag to serial correlation with the given lag.
In [79]:
import statsmodels.tsa.stattools as smtsa
# unbiased corrects for sample size
acf = smtsa.acf(resid, nlags=365, unbiased=True)
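The values in acf are roughly comparable to SerialCorr at the corresponding lags; a quick sketch to visualize the whole function:
# sketch: plot the autocorrelation function out to a one-year lag
thinkplot.Plot(np.arange(len(acf)), acf)
thinkplot.Config(xlabel='lag (days)', ylabel='serial correlation')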
In [104]:
def GenerateSimplePrediction(results, years):
    n = len(years)
    inter = np.ones(n)
    d = dict(Intercept=inter, years=years)
    predict_df = pandas.DataFrame(d)
    predict = results.predict(predict_df)
    return predict
years = np.linspace(0, 5, 101)
simplePrediction = GenerateSimplePrediction(results, years)
thinkplot.Plot(years, simplePrediction)
This prediction needs to take into account:
Sampling error: the prediction is based on estimated parameters, which would change if we ran the experiment again.
Random variation: even if the estimated parameters are perfect, the observed data varies randomly around the long-term trend.
Modeling error: predictions based on a linear model will eventually fail.
Unexpected future events.
In [100]:
"""
to quantify sampling error:
assume estimated parameters are correct
but residual could have been different
use resampling to rerun experiment
"""
def SimulateResults(daily, iters=101):
    model, results = RunLinearModel(daily)
    fake = daily.copy()
    result_seq = []
    for i in range(iters):
        fake.ppg = results.fittedvalues + thinkstats2.Resample(results.resid)
        _, fake_results = RunLinearModel(fake)
        result_seq.append(fake_results)
    return result_seq
def GeneratePredictions(result_seq, years, add_resid=False):
    n = len(years)
    d = dict(Intercept=np.ones(n), years=years, years2=years**2)
    predict_df = pandas.DataFrame(d)
    predict_seq = []
    for fake_results in result_seq:
        predict = fake_results.predict(predict_df)
        if add_resid:
            predict += thinkstats2.Resample(fake_results.resid, n)
        predict_seq.append(predict)
    return predict_seq
def PlotPredictions(daily, years, iters=101, percent=90):
    result_seq = SimulateResults(daily, iters=iters)
    p = (100 - percent) / 2
    percents = p, 100 - p

    predict_seq = GeneratePredictions(result_seq, years, add_resid=True)
    low, high = thinkstats2.PercentileRows(predict_seq, percents)
    thinkplot.FillBetween(years, low, high, alpha=0.3, color='gray')

    predict_seq = GeneratePredictions(result_seq, years, add_resid=False)
    low, high = thinkstats2.PercentileRows(predict_seq, percents)
    thinkplot.FillBetween(years, low, high, alpha=0.5, color='gray')
years = np.linspace(0, 5, 101)
print "high"
PlotPredictions(dailies['high'], years)
thinkplot.figure()
print "mid"
PlotPredictions(dailies['medium'], years)
thinkplot.figure()
print "low"
PlotPredictions(dailies['low'], years)
the dark gray region represents a 90% confidence interval for sampling error, that is, uncertainty about the estimated slope and intercept due to sampling.
the lighter region shows a confidence interval for prediction error, which is the sum of sampling error and random variation.
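The list above also mentions modeling error. One way to get a sense of it (a sketch, not part of this notebook) is to rerun the predictions with a different model, for example a quadratic fit; GeneratePredictions already includes a years2 column in its prediction DataFrame, so it works with such a model unchanged.
# sketch: an alternative quadratic model, useful for gauging modeling error
def RunQuadraticModel(daily):
    daily['years2'] = daily.years**2
    model = smf.ols('ppg ~ years + years2', data=daily)
    results = model.fit()
    return model, results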
In [98]:
dailies.keys()
Out[98]: