In [79]:
# Toy input used by the next cells: the integers 0 through 9.
series = np.arange(0, 10)
series
Out[79]:
In [80]:
# Toy example: 3-point rolling mean of the integers 0..9.
# pandas.rolling_mean() was removed in pandas 0.23; Series.rolling(...).mean()
# is the replacement (available since pandas 0.18).  The first two entries are
# NaN because the window is not yet full; `.values` yields a plain ndarray.
series = np.arange(10)
rolled = pandas.Series(series).rolling(3).mean().values
rolled
Out[80]:
In [231]:
import pandas
import thinkplot
import thinkstats2
import datetime
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
import statsmodels.tsa.stattools as smtsa
import timeseries
import scipy.signal as sig
# Load the transactions and pick the per-day frame for one quality level.
transactions = timeseries.ReadData()
dailies = timeseries.GroupByQualityAndDay(transactions)

name = 'high'
daily = dailies[name]
thinkplot.Scatter(daily.index, daily.ppg)

# Day-over-day differences of ppg; the first difference is NaN, so skip it.
series = daily.ppg.diff().iloc[1:]

# The same values in a random order, as a baseline for comparison.
# NOTE(review): the shuffle is unseeded, so this output changes on every run.
permutation = series.reindex(np.random.permutation(series.index))

# Lag-1 serial correlation of the real series next to the shuffled one.
thinkstats2.SerialCorr(series, 1), thinkstats2.SerialCorr(permutation, 1)
Out[231]:
In [206]:
# Fit the linear model on progressively shorter tails of `daily`.
# 101 evenly spaced integer offsets from 0 to len(daily); the last two are
# skipped by the loop below.
starts = np.linspace(0, len(daily), 101).astype(int)
for start in starts[:-2]:
    # The loop body must be indented; without it the cell does not parse.
    fake = daily[start:]
    _, fake_results = timeseries.RunLinearModel(fake)

# Only the tail and results from the final iteration survive the loop.
fake
Out[206]:
In [82]:
# Reindex `daily` onto every date between its first and last index value, so
# dates that were absent show up as NaN rows.
dates = pandas.date_range(start=daily.index.min(), end=daily.index.max())
daily_missing = daily.reindex(dates)

# Number of dates with no ppg value.
daily_missing['ppg'].isnull().sum()
Out[82]:
In [83]:
# 30-row rolling mean of ppg; a window needs at least 5 non-missing values,
# otherwise the result is NaN.
# pandas.rolling_mean() was removed in pandas 0.23; Series.rolling() is the
# replacement (available since pandas 0.18).
roll_mean = daily_missing.ppg.rolling(30, min_periods=5).mean()
thinkplot.Scatter(roll_mean.index, roll_mean)

# How many positions still have no rolling mean.
roll_mean.isnull().sum()
Out[83]:
In [84]:
# Exponentially weighted moving average of ppg with span 30.
# pandas.ewma() was removed in pandas 0.23; Series.ewm(...).mean() is the
# replacement (available since pandas 0.18).
ma = daily_missing.ppg.ewm(span=30).mean()

# Plot the EWMA itself.  The original plotted `roll_mean` from the previous
# cell against `ma.index`, so the figure never showed the series computed here.
thinkplot.Scatter(ma.index, ma)

# How many positions have no EWMA value.
ma.isnull().sum()
Out[84]:
In [85]:
# Exponentially weighted mean of the day-over-day change in ppg.
# pandas.ewma() was removed in pandas 0.23; Series.ewm(...).mean() replaces it.
# The second positional argument of the old pandas.ewma() was `com` (center of
# mass), so 180 is passed as com to keep the same weighting.
# NOTE(review): the neighbouring cell uses span=30; confirm com (not span) was
# intended here.
trend = daily_missing.ppg.diff().ewm(com=180).mean()
thinkplot.Scatter(trend.index, trend)
trend
Out[85]:
In [86]:
# Scratch arithmetic; the notebook does not record what 232, 109 and 15 stand for.
(232-109)/15.0
Out[86]:
In [87]:
# Fill the gaps in the rolling mean: time-weighted interpolation first, then
# back-fill whatever is still missing (interpolation cannot fill values that
# precede the first valid point).
roll_mean2 = roll_mean.interpolate(method='time').bfill()
thinkplot.Scatter(roll_mean2.index, roll_mean2)

# Count of values still missing after both passes.
roll_mean2.isnull().sum()
Out[87]:
In [88]:
# Residuals: ppg minus the gap-filled rolling mean (index-aligned subtraction).
resid = daily_missing['ppg'].sub(roll_mean2)

# Keep only the dates that actually have a residual.
roll_resid = resid.dropna()
thinkplot.Scatter(roll_resid.index, roll_resid)

# Missing-value count after dropna().
roll_resid.isnull().sum()
Out[88]:
In [89]:
# Same residual as the previous cell, but stored as a 'resid' column on
# daily_missing (this mutates the frame).
daily_missing['resid'] = daily_missing['ppg'] - roll_mean2

# Non-missing residuals only.
roll_resid = daily_missing['resid'].dropna()
thinkplot.Scatter(roll_resid.index, roll_resid)

# Missing-value count after dropna().
roll_resid.isnull().sum()
Out[89]:
In [90]:
# Draw one resampled residual (with replacement) per row of daily_missing.
# NOTE(review): the draw is unseeded, so the filled values differ on every run.
fake_resid = np.random.choice(roll_resid, size=len(daily_missing), replace=True)

# Fill each missing ppg with the smoothed value plus a resampled residual;
# rows that already have a ppg are left untouched by fillna().
daily_missing['ppg'] = daily_missing['ppg'].fillna(roll_mean2 + fake_resid)
thinkplot.Scatter(daily_missing.index, daily_missing.ppg)

# Per-column count of values still missing.
daily_missing.isnull().sum()
Out[90]:
In [154]:
# Seed the RNG so the fill is repeatable, then build the filled frame.
thinkstats2.RandomSeed(17)
filled = timeseries.FillMissing(daily)

# Count of missing ppg values; not the last expression, so it is not displayed.
filled.ppg.isnull().sum()

# Correlate the residuals with themselves shifted by `lag` rows.  The first
# `lag` rows are dropped from both series because the shifted copy is NaN there.
lag = 92
xs = filled.resid.iloc[lag:]
ys = filled.resid.shift(lag).iloc[lag:]
thinkstats2.Corr(xs, ys)
Out[154]:
In [155]:
# Cross-check the lagged correlation with pandas: put both series in one frame
# and read the xs/ys entry of its correlation matrix.
df = pandas.DataFrame({'xs': xs, 'ys': ys})
df.corr().loc['ys', 'xs']
Out[155]:
In [161]:
# Autocorrelation function of the residuals for lags 0..100, with unbiased=True.
acf = smtsa.acf(filled.resid, nlags=100, unbiased=True)
#thinkplot.Plot(acf)

# Display the values at lags 0, 1, 7 and 30.
tuple(acf[k] for k in (0, 1, 7, 30))
Out[161]:
In [160]:
# Same autocorrelation as the unbiased=True cell, but with unbiased=False,
# for comparison.
acf = smtsa.acf(filled.resid, nlags=100, unbiased=False)
#thinkplot.Plot(acf)

# Display the values at lags 0, 1, 7 and 30.
tuple(acf[k] for k in (0, 1, 7, 30))
Out[160]:
In [194]:
# Extend the 'ewma' column one year past the data with a straight line.
thinkstats2.RandomSeed(17)
filled = timeseries.FillMissing(daily)

# Smoothed day-over-day change.  pandas.ewma() was removed in pandas 0.23;
# its second positional argument was `com`, hence com=500 here.
filled['slope'] = filled.ppg.diff().ewm(com=500).mean()

# Anchor the line at the last row: its date, its ewma value and its slope.
# .iloc[-1] is explicit positional access; plain [-1] on a non-integer index
# relies on a positional fallback that newer pandas no longer provides.
# The 'ewma' column is presumably created by timeseries.FillMissing -- confirm.
start = filled.index[-1]
inter = filled['ewma'].iloc[-1]
slope = filled['slope'].iloc[-1]

# Reindex onto a date range that runs 365 days past the end of `daily`;
# the added rows are NaN in every column.
dates = pandas.date_range(filled.index.min(), daily.index.max()+np.timedelta64(365, 'D'))
predicted = filled.reindex(dates)

# Days elapsed since the anchor date (negative before it).
predicted['date'] = predicted.index
one_day = np.timedelta64(1, 'D')
predicted['days'] = (predicted.date - start) / one_day

# Linear extrapolation; it only replaces rows whose ewma is missing.
predict = inter + slope * predicted.days

# Assign the result back to the column.  The original called
# fillna(inplace=True) on `predicted.ewma`, which operates on a Series that may
# be a copy, so the frame is not guaranteed to be updated.
predicted['ewma'] = predicted['ewma'].fillna(predict)
thinkplot.Plot(predicted.ewma)
In [12]:
# Lagged copies of ppg: shift(-k) moves each value k rows earlier, so row t
# holds the ppg of row t+k.
daily_missing['ppg1'] = daily_missing.ppg.shift(-1)
daily_missing['ppg7'] = daily_missing.ppg.shift(-7)
# shift() takes a whole number of periods; the original passed -30.5.
daily_missing['ppg30'] = daily_missing.ppg.shift(-30)
daily_missing['ppg365'] = daily_missing.ppg.shift(-365)

# Centered 30-row rolling means of ppg and of three of its lagged copies.
# pandas.rolling_mean() was removed in pandas 0.23; Series.rolling() replaces it.
ppg_mean = daily_missing.ppg.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, ppg_mean)
ppg7_mean = daily_missing.ppg7.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, ppg7_mean)
ppg30_mean = daily_missing.ppg30.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, ppg30_mean)
ppg365_mean = daily_missing.ppg365.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, ppg365_mean)

# Pairwise correlations between ppg and its lagged copies.
daily_missing[['ppg', 'ppg1', 'ppg7', 'ppg30', 'ppg365']].corr()
Out[12]:
In [13]:
# The lag analysis from the ppg cell, repeated on the residuals.
# shift(-k) moves each value k rows earlier, so row t holds the residual of
# row t+k.
daily_missing['resid1'] = daily_missing.resid.shift(-1)
# The original shifted by -5 here although the column is named resid7 and the
# matching ppg7 column uses -7.
daily_missing['resid7'] = daily_missing.resid.shift(-7)
# shift() takes a whole number of periods; the original passed -30.5.
daily_missing['resid30'] = daily_missing.resid.shift(-30)
daily_missing['resid365'] = daily_missing.resid.shift(-365)

# Centered 30-row rolling means of the residuals and three lagged copies.
# pandas.rolling_mean() was removed in pandas 0.23; Series.rolling() replaces it.
resid_mean = daily_missing.resid.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, resid_mean)
resid7_mean = daily_missing.resid7.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, resid7_mean)
resid30_mean = daily_missing.resid30.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, resid30_mean)
resid365_mean = daily_missing.resid365.rolling(30, center=True).mean()
thinkplot.Plot(daily_missing.index, resid365_mean)

# Pairwise correlations between the residuals and their lagged copies.
daily_missing[['resid', 'resid1', 'resid7', 'resid30', 'resid365']].corr()
In [30]:
# Ordinary least squares: regress the residual on its 365-row lagged copy.
model = smf.ols(formula='resid ~ resid365', data=daily_missing)
results = model.fit()

# Display the fitted-model summary.
results.summary()
Out[30]:
In [22]:
# Ordinary least squares: regress ppg on its 1-, 7- and 30-row lagged copies.
model = smf.ols(formula='ppg ~ ppg1 + ppg7 + ppg30', data=daily_missing)
results = model.fit()

# Display the fitted-model summary.
results.summary()
Out[22]:
In [35]:
# Raw daily prices, faint, as a backdrop for the smoothed curve.
thinkplot.Scatter(daily.index, daily.ppg, alpha=0.1)

# Mean weight of a 30-point triangular window from scipy.
triangle = sig.get_window('triangle', 30)
triangle_mean = triangle.mean()

# Triangular-window rolling mean of ppg.  pandas.rolling_window() was removed
# in pandas 0.23; Series.rolling(win_type=...).mean() is the replacement.
# NOTE(review): the windowed mean is already normalized by the window weights
# (the assert cells at the end of this notebook check that a series of ones
# averages to 1.0), so the extra division by triangle_mean looks like it
# inflates the curve -- confirm before relying on this plot.
roll_mean = daily.ppg.rolling(30, win_type='triang').mean() / triangle_mean
thinkplot.Plot(daily.index, roll_mean)
In [ ]:
In [24]:
# ARIMA model of daily ppg with order (7, 1, 2).
# NOTE(review): `import statsmodels.tsa as tsa` alone may not load the
# arima_model submodule; confirm this attribute access works on a fresh kernel.
model = tsa.arima_model.ARIMA(daily.ppg, order=(7, 1, 2))
In [23]:
# Fit whatever `model` currently refers to and display the summary.
# NOTE(review): this cell's execution count (23) is lower than that of the
# cell above it (24), so the saved run order differs from the file order.
results = model.fit()
results.summary()
Out[23]:
In [196]:
# Slice 365 entries out of `ppg`, starting at offset 121.
# NOTE(review): `ppg` is not assigned anywhere above this cell in the saved
# notebook -- presumably left over in the kernel from a later cell; confirm.
year11 = ppg[121:121+365]
#year11 = year11.reindex(range(0, 365))
# Not the last expression in the cell, so this is not displayed.
year11
# The date values at index 121 and 121+364 of the slice.
year11.date[121], year11.date[121+364]
Out[196]:
In [210]:
# One year of ppg with a centered 30-row rolling mean drawn over it.
thinkplot.Plot(year11.index, year11.ppg)

# pandas.rolling_mean() was removed in pandas 0.23; Series.rolling() replaces it.
roll_mean = year11.ppg.rolling(30, center=True).mean()
thinkplot.Plot(year11.index, roll_mean, color='yellow')

# The date values at index 250, 350 and 430.
year11.date[250], year11.date[350], year11.date[430]
Out[210]:
In [300]:
import scipy.signal as sig
# Build three 30-point windows (gaussian with std 7.5, boxcar, triangle).
# For each one: remember its mean weight, rescale it in place so its mean
# becomes 1, and plot it.

gaussian = sig.get_window(('gaussian', 7.5), 30)
gaussian_mean = gaussian.mean()
gaussian /= gaussian_mean
thinkplot.Plot(gaussian)

boxcar = sig.get_window('boxcar', 30)
boxcar_mean = boxcar.mean()
boxcar /= boxcar_mean
thinkplot.Plot(boxcar)

triangle = sig.get_window('triangle', 30)
triangle_mean = triangle.mean()
triangle /= triangle_mean
thinkplot.Plot(triangle)
In [301]:
# One year of ppg with a centered triangular-window rolling mean drawn over it.
thinkplot.Plot(year11.index, year11.ppg)

# pandas.rolling_window() was removed in pandas 0.23;
# Series.rolling(win_type=...).mean() is the replacement.
roll_mean = year11.ppg.rolling(30, win_type='triang', center=True).mean()
# NOTE(review): the windowed mean is already normalized by the window weights,
# so this extra division by triangle_mean looks like it inflates the curve --
# confirm.
roll_mean /= triangle_mean
thinkplot.Plot(year11.index, roll_mean, color='yellow')

# The date values at index 250, 350 and 430.
year11.date[250], year11.date[350], year11.date[430]
Out[301]:
In [303]:
# One year of ppg with a centered gaussian-window rolling mean drawn over it.
thinkplot.Plot(year11.index, year11.ppg)

# pandas.rolling_window() was removed in pandas 0.23.  With Series.rolling(),
# the gaussian window's std is passed to the aggregation call.
roll_mean = year11.ppg.rolling(30, win_type='gaussian', center=True).mean(std=7.5)
# NOTE(review): the windowed mean is already normalized by the window weights,
# so this extra division by gaussian_mean looks like it inflates the curve --
# confirm.
roll_mean /= gaussian_mean
thinkplot.Plot(year11.index, roll_mean, color='yellow')

# The date values at index 250, 350 and 430.
year11.date[250], year11.date[350], year11.date[430]
Out[303]:
In [113]:
# Reindex ppg onto every integer between its smallest and largest index value,
# so index values that were absent become NaN rows.
index_values = ppg.index.values
low, high = index_values.min(), index_values.max()
ppg = ppg.reindex(np.arange(low, high + 1))
thinkplot.Plot(ppg.index, ppg.ppg)
In [114]:
# Add the index divided by 365 as a 'years' column, then regress ppg on it.
ppg['years'] = ppg.index / 365.0

model = smf.ols(formula='ppg ~ years', data=ppg)
results = model.fit()

# Display the fitted-model summary.
results.summary()
Out[114]:
In [343]:
# Pair the model's second design-matrix column with the fit residuals.
xs = model.exog[:, 1]
ys = results.resid
df = pandas.DataFrame({'xs': xs, 'ys': ys})

# NOTE(review): this displays the whole frame and rebinds the shared name `df`.
df
Out[343]:
In [115]:
# Plot the fit residuals against the second column of the model's design matrix.
years = model.exog[:,1]
thinkplot.Plot(years, results.resid)
In [139]:
# Regress ppg on 'years' and its square.
ppg['years'] = ppg.index / 365.0
ppg['years2'] = ppg['years'] ** 2

model = smf.ols(formula='ppg ~ years + years2', data=ppg)
results = model.fit()

# Display the fitted-model summary.
results.summary()
Out[139]:
In [140]:
# Plot the fit residuals against the second column of the model's design matrix.
years = model.exog[:,1]
thinkplot.Plot(years, results.resid)
In [144]:
# Show the row of ppg whose index label is 100.
ppg.loc[100]
Out[144]:
In [116]:
# Histogram of the differences between consecutive index values of ppg.
thinkstats2.Hist(np.diff(ppg.index.values)).Render()
Out[116]:
In [263]:
# autocorrelation_plot lives in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting module was later removed.  Fall back to the old
# location so the cell still runs on the pandas version this notebook was
# written against.
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:
    from pandas.tools.plotting import autocorrelation_plot

# Autocorrelation of the day-over-day change in ppg (missing values dropped).
autocorrelation_plot(ppg.ppg.diff().dropna())
Out[263]:
In [118]:
# Compare high-quality prices in two states.
# NOTE(review): `df` is rebound to an xs/ys frame in earlier cells; this cell
# needs a frame with quality, state, days and ppg columns -- presumably the
# transactions data; confirm which frame was intended.
high = df.loc[df['quality'] == 'high']

ca_high = high.loc[high['state'] == 'CA']
thinkplot.Scatter(ca_high.days, ca_high.ppg, alpha=0.05)

ma_high = high.loc[high['state'] == 'MA']
thinkplot.Scatter(ma_high.days, ma_high.ppg, alpha=0.05, color='red')
In [120]:
# Correlation matrix of two independent standard-normal samples of size 100.
x = np.random.randn(100)
#x[50] = np.nan
y = np.random.randn(100)

# `ddof` is a deprecated argument of np.corrcoef that has no effect on the
# result, so it is dropped.
np.corrcoef(x, y)
Out[120]:
In [121]:
# Series.corr on two random samples when one of them has a missing value.
x = pandas.Series(np.random.randn(100))
x.loc[50] = np.nan  # knock out a single observation
y = pandas.Series(np.random.randn(100))
x.corr(y)
Out[121]:
In [129]:
# Ten random values and a copy shifted down by one row (first entry is NaN).
x = pandas.Series(np.random.randn(10))
#x[5] = np.nan
x1 = x.shift(periods=1)
x
Out[129]:
In [130]:
# Display the shifted series x1.
x1
Out[130]:
In [131]:
# Correlation between x and x1 (computed with Series.corr).
x.corr(x1)
Out[131]:
In [132]:
# The same comparison with a shift of two rows.
x2 = x.shift(periods=2)
x.corr(x2)
Out[132]:
In [134]:
# statsmodels autocorrelation function of x, called with default arguments.
smtsa.acf(x)
Out[134]:
In [291]:
# Sanity check: a triangular-window rolling mean of a constant series of ones
# should average to 1.0 (NaN warm-up entries are skipped by .mean()).
n = 10
ser = pandas.Series(np.ones(n))

# pandas.rolling_window() was removed in pandas 0.23;
# Series.rolling(win_type=...).mean() is the replacement.
mean = ser.rolling(5, win_type='triang').mean().mean()
np.testing.assert_approx_equal(mean, 1.0)
In [285]:
# Triangular-window rolling mean of `ser` should average to 1.0.
# pandas.rolling_window() was removed in pandas 0.23;
# Series.rolling(win_type=...).mean() is the replacement.
mean = ser.rolling(5, win_type='triang').mean().mean()
np.testing.assert_approx_equal(mean, 1.0)
In [288]:
# Gaussian-window (std 1.5) rolling mean of `ser` should average to 1.0.
# pandas.rolling_window() was removed in pandas 0.23.  With Series.rolling(),
# the window's std is passed to the aggregation call.
mean = ser.rolling(5, win_type='gaussian').mean(std=1.5).mean()
np.testing.assert_approx_equal(mean, 1.0)
In [289]:
# Boxcar-window rolling mean of `ser` should average to 1.0.
# pandas.rolling_window() was removed in pandas 0.23;
# Series.rolling(win_type=...).mean() is the replacement.
mean = ser.rolling(5, win_type='boxcar').mean().mean()
np.testing.assert_approx_equal(mean, 1.0)
In [ ]: