notebook.community

Edit and run



In [3]:

    
%matplotlib inline

from __future__ import print_function

import pandas
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.tsa.stattools as smtsa

import matplotlib.pyplot as pyplot

import thinkplot
import thinkstats2
import regression

FORMATS = ['png']



In [4]:

    
transactions = pandas.read_csv('mj-clean.csv', parse_dates=[5])



In [5]:

    
transactions.columns









    Out[5]:





Index([u'city', u'state', u'price', u'amount', u'quality', u'date', u'ppg',
       u'state.name', u'lat', u'lon'],
      dtype='object')



In [6]:

    
transactions.head()









    Out[6]:






  
    
      
      city
      state
      price
      amount
      quality
      date
      ppg
      state.name
      lat
      lon
    
  
  
    
      0
      Annandale
      VA
      100
      7.075
      high
      2010-09-02
      14.13
      Virginia
      38.830345
      -77.213870
    
    
      1
      Auburn
      AL
      60
      28.300
      high
      2010-09-02
      2.12
      Alabama
      32.578185
      -85.472820
    
    
      2
      Austin
      TX
      60
      28.300
      medium
      2010-09-02
      2.12
      Texas
      30.326374
      -97.771258
    
    
      3
      Belleville
      IL
      400
      28.300
      high
      2010-09-02
      14.13
      Illinois
      38.532311
      -89.983521
    
    
      4
      Boone
      NC
      55
      3.540
      high
      2010-09-02
      15.54
      North Carolina
      36.217052
      -81.687983



In [7]:

    
transactions.shape









    Out[7]:





(147070, 10)



In [8]:

    
def GroupByQualityAndDay(transactions):
    groups = transactions.groupby('quality')
    dailies = {}
    for name, group in groups:
        dailies[name] = GroupByDay(group)
        
    return dailies

def GroupByDay(transactions, func=np.mean):
    grouped = transactions[['date', 'ppg']].groupby('date')
    daily = grouped.aggregate(func)
    
    daily['date'] = daily.index
    start = daily.date[0]
    one_year = np.timedelta64(1, 'Y')
    daily['years'] = (daily.date - start) / one_year
    
    return daily



In [9]:

    
dailies = GroupByQualityAndDay(transactions)
dailies









    Out[9]:





{'high':                   ppg       date     years
 date                                      
 2010-09-02  13.384186 2010-09-02  0.000000
 2010-09-03  14.459588 2010-09-03  0.002738
 2010-09-04  14.923333 2010-09-04  0.005476
 2010-09-05  16.667500 2010-09-05  0.008214
 2010-09-06  15.537500 2010-09-06  0.010952
 2010-09-07  12.715000 2010-09-07  0.013690
 2010-09-08  13.865000 2010-09-08  0.016427
 2010-09-11  15.540000 2010-09-11  0.024641
 2010-09-12  11.310000 2010-09-12  0.027379
 2010-09-13  13.300000 2010-09-13  0.030117
 2010-09-14  13.670207 2010-09-14  0.032855
 2010-09-15  13.510101 2010-09-15  0.035593
 2010-09-16  13.483694 2010-09-16  0.038331
 2010-09-17  13.234431 2010-09-17  0.041069
 2010-09-18  13.656053 2010-09-18  0.043807
 2010-09-19  13.915778 2010-09-19  0.046544
 2010-09-20  13.769322 2010-09-20  0.049282
 2010-09-21  13.578381 2010-09-21  0.052020
 2010-09-22  13.563624 2010-09-22  0.054758
 2010-09-23  13.644103 2010-09-23  0.057496
 2010-09-24  13.050305 2010-09-24  0.060234
 2010-09-25  12.906000 2010-09-25  0.062972
 2010-09-26  13.819722 2010-09-26  0.065710
 2010-09-27  14.509123 2010-09-27  0.068448
 2010-09-28  12.676905 2010-09-28  0.071186
 2010-09-29  13.755075 2010-09-29  0.073923
 2010-09-30  13.556000 2010-09-30  0.076661
 2010-10-01  13.169030 2010-10-01  0.079399
 2010-10-02  13.002157 2010-10-02  0.082137
 2010-10-03  13.541899 2010-10-03  0.084875
 ...               ...        ...       ...
 2014-04-14  11.124048 2014-04-14  3.614037
 2014-04-15  10.552581 2014-04-15  3.616775
 2014-04-16  10.332857 2014-04-16  3.619513
 2014-04-17  11.031607 2014-04-17  3.622251
 2014-04-18  11.269863 2014-04-18  3.624989
 2014-04-19  11.841000 2014-04-19  3.627727
 2014-04-20  10.712308 2014-04-20  3.630465
 2014-04-21  10.984795 2014-04-21  3.633203
 2014-04-22  10.760217 2014-04-22  3.635941
 2014-04-23  10.920000 2014-04-23  3.638678
 2014-04-24  10.250833 2014-04-24  3.641416
 2014-04-25  10.482462 2014-04-25  3.644154
 2014-04-26  11.379254 2014-04-26  3.646892
 2014-04-27  11.482391 2014-04-27  3.649630
 2014-04-28  10.784516 2014-04-28  3.652368
 2014-04-29  11.750556 2014-04-29  3.655106
 2014-04-30  11.212727 2014-04-30  3.657844
 2014-05-01  10.286923 2014-05-01  3.660582
 2014-05-02  12.260000 2014-05-02  3.663320
 2014-05-03   9.942941 2014-05-03  3.666057
 2014-05-04  10.552459 2014-05-04  3.668795
 2014-05-05  11.290244 2014-05-05  3.671533
 2014-05-06  11.199710 2014-05-06  3.674271
 2014-05-07  12.064058 2014-05-07  3.677009
 2014-05-08   9.953333 2014-05-08  3.679747
 2014-05-09  11.468298 2014-05-09  3.682485
 2014-05-10  10.532326 2014-05-10  3.685223
 2014-05-11  11.518750 2014-05-11  3.687961
 2014-05-12  10.578293 2014-05-12  3.690699
 2014-05-13   9.604615 2014-05-13  3.693437
 
 [1241 rows x 3 columns], 'low':                   ppg       date     years
 date                                      
 2010-09-02   4.943750 2010-09-02  0.000000
 2010-09-03   3.984138 2010-09-03  0.002738
 2010-09-04   3.530000 2010-09-04  0.005476
 2010-09-10   4.240000 2010-09-10  0.021903
 2010-09-14   6.066118 2010-09-14  0.032855
 2010-09-15   5.326774 2010-09-15  0.035593
 2010-09-16   4.540000 2010-09-16  0.038331
 2010-09-17   6.365000 2010-09-17  0.041069
 2010-09-18   5.415000 2010-09-18  0.043807
 2010-09-19   5.776667 2010-09-19  0.046544
 2010-09-20   5.336250 2010-09-20  0.049282
 2010-09-21   4.589412 2010-09-21  0.052020
 2010-09-22   3.995833 2010-09-22  0.054758
 2010-09-23   3.838636 2010-09-23  0.057496
 2010-09-24   4.886154 2010-09-24  0.060234
 2010-09-25   3.665000 2010-09-25  0.062972
 2010-09-26   6.463846 2010-09-26  0.065710
 2010-09-27   5.039474 2010-09-27  0.068448
 2010-09-28   3.205714 2010-09-28  0.071186
 2010-09-29   4.978148 2010-09-29  0.073923
 2010-09-30   5.621250 2010-09-30  0.076661
 2010-10-01   4.352857 2010-10-01  0.079399
 2010-10-02   4.425385 2010-10-02  0.082137
 2010-10-03   5.206429 2010-10-03  0.084875
 2010-10-04   3.285000 2010-10-04  0.087613
 2010-10-05   3.390000 2010-10-05  0.090351
 2010-10-06   4.976364 2010-10-06  0.093089
 2010-10-07   5.376667 2010-10-07  0.095827
 2010-10-08   3.332000 2010-10-08  0.098565
 2010-10-09   3.416667 2010-10-09  0.101303
 ...               ...        ...       ...
 2014-04-12   7.415714 2014-04-12  3.608561
 2014-04-13   8.825000 2014-04-13  3.611299
 2014-04-14   8.023333 2014-04-14  3.614037
 2014-04-15   8.201667 2014-04-15  3.616775
 2014-04-16   6.090000 2014-04-16  3.619513
 2014-04-17   4.003333 2014-04-17  3.622251
 2014-04-18   3.533333 2014-04-18  3.624989
 2014-04-19   4.950000 2014-04-19  3.627727
 2014-04-20   6.355000 2014-04-20  3.630465
 2014-04-21   6.356667 2014-04-21  3.633203
 2014-04-22   4.210000 2014-04-22  3.635941
 2014-04-23   8.751111 2014-04-23  3.638678
 2014-04-25   4.695000 2014-04-25  3.644154
 2014-04-26   3.632500 2014-04-26  3.646892
 2014-04-27   3.530000 2014-04-27  3.649630
 2014-04-28   5.120000 2014-04-28  3.652368
 2014-04-29   8.300000 2014-04-29  3.655106
 2014-04-30   7.272857 2014-04-30  3.657844
 2014-05-01   4.577143 2014-05-01  3.660582
 2014-05-02   2.943333 2014-05-02  3.663320
 2014-05-03   4.006667 2014-05-03  3.666057
 2014-05-04   4.300000 2014-05-04  3.668795
 2014-05-05  12.488000 2014-05-05  3.671533
 2014-05-06   6.886667 2014-05-06  3.674271
 2014-05-07   5.533750 2014-05-07  3.677009
 2014-05-09  14.700000 2014-05-09  3.682485
 2014-05-10  10.332857 2014-05-10  3.685223
 2014-05-11   3.336667 2014-05-11  3.687961
 2014-05-12   3.270000 2014-05-12  3.690699
 2014-05-13   4.447778 2014-05-13  3.693437
 
 [1179 rows x 3 columns], 'medium':                   ppg       date     years
 date                                      
 2010-09-02  11.021250 2010-09-02  0.000000
 2010-09-03   8.055000 2010-09-03  0.002738
 2010-09-04  16.950000 2010-09-04  0.005476
 2010-09-05  12.005000 2010-09-05  0.008214
 2010-09-08  16.250000 2010-09-08  0.016427
 2010-09-09  12.710000 2010-09-09  0.019165
 2010-09-13  14.120000 2010-09-13  0.030117
 2010-09-14   9.949200 2010-09-14  0.032855
 2010-09-15   9.211096 2010-09-15  0.035593
 2010-09-16   8.737500 2010-09-16  0.038331
 2010-09-17   9.886552 2010-09-17  0.041069
 2010-09-18   9.733684 2010-09-18  0.043807
 2010-09-19   7.992121 2010-09-19  0.046544
 2010-09-20   9.554573 2010-09-20  0.049282
 2010-09-21   8.945181 2010-09-21  0.052020
 2010-09-22   8.804537 2010-09-22  0.054758
 2010-09-23   7.706806 2010-09-23  0.057496
 2010-09-24   7.898730 2010-09-24  0.060234
 2010-09-25   7.865417 2010-09-25  0.062972
 2010-09-26   8.874048 2010-09-26  0.065710
 2010-09-27   9.020784 2010-09-27  0.068448
 2010-09-28   7.711000 2010-09-28  0.071186
 2010-09-29   8.369524 2010-09-29  0.073923
 2010-09-30   9.128571 2010-09-30  0.076661
 2010-10-01   9.108462 2010-10-01  0.079399
 2010-10-02   8.680227 2010-10-02  0.082137
 2010-10-03   9.089189 2010-10-03  0.084875
 2010-10-04   9.648000 2010-10-04  0.087613
 2010-10-05   8.973333 2010-10-05  0.090351
 2010-10-06   7.558636 2010-10-06  0.093089
 ...               ...        ...       ...
 2014-04-14  10.033774 2014-04-14  3.614037
 2014-04-15  10.271538 2014-04-15  3.616775
 2014-04-16  10.829344 2014-04-16  3.619513
 2014-04-17   9.602462 2014-04-17  3.622251
 2014-04-18   9.746000 2014-04-18  3.624989
 2014-04-19   9.844737 2014-04-19  3.627727
 2014-04-20   9.030247 2014-04-20  3.630465
 2014-04-21   9.395125 2014-04-21  3.633203
 2014-04-22   9.513750 2014-04-22  3.635941
 2014-04-23   9.406184 2014-04-23  3.638678
 2014-04-24  12.764545 2014-04-24  3.641416
 2014-04-25   9.389298 2014-04-25  3.644154
 2014-04-26  10.209211 2014-04-26  3.646892
 2014-04-27   8.751429 2014-04-27  3.649630
 2014-04-28   8.889753 2014-04-28  3.652368
 2014-04-29   7.430980 2014-04-29  3.655106
 2014-04-30   9.658585 2014-04-30  3.657844
 2014-05-01   9.538864 2014-05-01  3.660582
 2014-05-02   8.515106 2014-05-02  3.663320
 2014-05-03   9.005000 2014-05-03  3.666057
 2014-05-04   8.931571 2014-05-04  3.668795
 2014-05-05   9.167670 2014-05-05  3.671533
 2014-05-06   9.262632 2014-05-06  3.674271
 2014-05-07   8.918585 2014-05-07  3.677009
 2014-05-08   7.928889 2014-05-08  3.679747
 2014-05-09   9.037059 2014-05-09  3.682485
 2014-05-10   9.155686 2014-05-10  3.685223
 2014-05-11   8.138214 2014-05-11  3.687961
 2014-05-12   8.959655 2014-05-12  3.690699
 2014-05-13   9.810625 2014-05-13  3.693437
 
 [1238 rows x 3 columns]}



In [8]:

    
thinkplot.PrePlot(rows=3)
for i, (name, daily) in enumerate(dailies.items()):
    thinkplot.SubPlot(i+1)
    title = 'price per gram ($)' if i == 0 else ''
    thinkplot.Config(ylim=[0,20], title=title)
    thinkplot.Scatter(daily.index, daily.ppg, s=10, label=name)
    if i == 2:
        pyplot.xticks(rotation=30)
    else:
        thinkplot.Config(xticks=[])









    



/Users/nathankiner/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.py:475: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "
/Users/nathankiner/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [9]:

    
def RunLinearModel(daily):
    model = smf.ols('ppg ~ years', data=daily)
    results = model.fit()
    return model, results



In [11]:

    
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
    thinkplot.SubPlot(i)
    model, results = RunLinearModel(daily)
    PlotFittedValues(model, results, label=name)
    i += 1



In [10]:

    
def PlotFittedValues(model, results, label=''):
    years = model.exog[:,1]
    values = model.endog
    thinkplot.Scatter(years, values, s=15, label=label)
    thinkplot.Plot(years, results.fittedvalues, label='model')



In [12]:

    
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
    thinkplot.SubPlot(i)
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)
    thinkplot.Scatter(reindexed.index, reindexed.ppg, s=15, label=name)
    thinkplot.Plot(ewma.index, ewma)
    i += 1



In [13]:

    
def SerialCorr(series, lag=1):
    xs = series[lag:]
    ys = series.shift(lag)[lag:]
    corr = thinkstats2.Corr(xs, ys)
    return corr



In [14]:

    
lag_corr = {}
nlags = 365
thinkplot.PrePlot(rows = 3)
i = 1
for name, daily in dailies.items():
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)
    
    rolling_mean = pandas.rolling_mean(reindexed.ppg, 30)
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    
    lag_corr[name] = []
    acf = smtsa.acf(res, nlags=nlags, unbiased=True)
    thinkplot.SubPlot(i)
    SimulateAutocorrelation(daily, nlags=nlags)
    thinkplot.Plot(acf, label=name)
    i += 1
    thinkplot.Show(axis=[0,nlags,-.1,.1])









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-fd31791312c0> in <module>()
     19     acf = smtsa.acf(res, nlags=nlags, unbiased=True)
     20     thinkplot.SubPlot(i)
---> 21     SimulateAutocorrelation(daily, nlags=nlags)
     22     thinkplot.Plot(acf, label=name)
     23     i += 1

NameError: name 'SimulateAutocorrelation' is not defined



In [15]:

    
def SimulateAutocorrelation(daily, iters=1001, nlags=40):
    """Resample residuals, compute autocorrelation, and plot percentiles.

    daily: DataFrame
    iters: number of simulations to run
    nlags: maximum lags to compute autocorrelation
    """
    # run simulations
    t = []
    for _ in range(iters):
        filled = FillMissing(daily, span=30)
        resid = thinkstats2.Resample(filled.resid)
        acf = smtsa.acf(resid, nlags=nlags, unbiased=True)[1:]
        t.append(np.abs(acf))

    high = thinkstats2.PercentileRows(t, [97.5])[0]
    low = -high
    lags = range(1, nlags+1)
    thinkplot.FillBetween(lags, low, high, alpha=0.2, color='gray')



In [16]:

    
def FillMissing(daily, span=30):
    """Fills missing values with an exponentially weighted moving average.

    Resulting DataFrame has new columns 'ewma' and 'resid'.

    daily: DataFrame of daily prices
    span: window size (sort of) passed to ewma

    returns: new DataFrame of daily prices
    """
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)

    ewma = pandas.ewma(reindexed.ppg, span=span)

    resid = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(resid, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)

    reindexed['ewma'] = ewma
    reindexed['resid'] = reindexed.ppg - ewma
    return reindexed



In [17]:

    
def AddWeeklySeasonality(daily, inc=2):
    frisat = (daily.index.dayofweek == 4) | (daily.index.dayofweek == 5)
    fake = daily.copy()
    fake.ppg[frisat] += np.random.uniform(0, inc, frisat.sum())
    return fake



In [83]:

    
lag_corr = {}
nlags = 40
thinkplot.PrePlot(rows = 3)
i = 1
for name, daily in dailies.items():
    fdaily = AddWeeklySeasonality(daily, 2)
    dates = pandas.date_range(fdaily.index.min(), fdaily.index.max())
    reindexed = fdaily.reindex(dates)
    
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)
    
    rolling_mean = pandas.rolling_mean(reindexed.ppg, 30)
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    
    lag_corr[name] = []
    acf = smtsa.acf(res, nlags=nlags, unbiased=True)
    thinkplot.SubPlot(i)
    SimulateAutocorrelation(fdaily, nlags=nlags)
    thinkplot.Plot(acf, label=name)
    i += 1
    thinkplot.Show(axis=[0,nlags,-.1,.1])









    



/Users/nathankiner/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    












    












    












    





<matplotlib.figure.Figure at 0x108cad350>



In [18]:

    
def GenerateSimplePrediction(results, years):
    """Generates a simple prediction.

    results: results object
    years: sequence of times (in years) to make predictions for

    returns: sequence of predicted values
    """
    n = len(years)
    inter = np.ones(n)
    d = dict(Intercept=inter, years=years)
    predict_df = pandas.DataFrame(d)
    predict = results.predict(predict_df)
    return predict



In [19]:

    
def RunQuadraticModel(daily):
    daily["years2"] = daily.years**2
    model = smf.ols('ppg ~ years + years2', data=daily)
    results = model.fit()
    return model, results



In [38]:

    
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
    thinkplot.SubPlot(i)
    model, results = RunPolyModel(daily,2)
    print(results.summary())
    PlotFittedValues(model, results, label=name)
    i += 1









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    ppg   R-squared:                       0.459
Model:                            OLS   Adj. R-squared:                  0.458
Method:                 Least Squares   F-statistic:                     349.6
Date:                Mon, 23 Nov 2015   Prob (F-statistic):          2.16e-164
Time:                        13:13:10   Log-Likelihood:                -1493.4
No. Observations:                1241   AIC:                             2995.
Df Residuals:                    1237   BIC:                             3015.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     13.5302      0.089    152.075      0.000        13.356    13.705
years         -0.5741      0.209     -2.746      0.006        -0.984    -0.164
years2        -0.2538      0.131     -1.931      0.054        -0.512     0.004
years3         0.0661      0.023      2.833      0.005         0.020     0.112
==============================================================================
Omnibus:                       47.069   Durbin-Watson:                   1.897
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              114.558
Skew:                           0.159   Prob(JB):                     1.33e-25
Kurtosis:                       4.454   Cond. No.                         206.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    ppg   R-squared:                       0.085
Model:                            OLS   Adj. R-squared:                  0.083
Method:                 Least Squares   F-statistic:                     38.26
Date:                Mon, 23 Nov 2015   Prob (F-statistic):           1.21e-23
Time:                        13:13:10   Log-Likelihood:                -2030.5
No. Observations:                1238   AIC:                             4069.
Df Residuals:                    1234   BIC:                             4089.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      8.3927      0.140     59.916      0.000         8.118     8.668
years          1.0136      0.328      3.093      0.002         0.371     1.657
years2        -0.1488      0.205     -0.724      0.469        -0.552     0.254
years3        -0.0161      0.036     -0.444      0.657        -0.087     0.055
==============================================================================
Omnibus:                      192.935   Durbin-Watson:                   1.835
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1092.864
Skew:                           0.586   Prob(JB):                    4.87e-238
Kurtosis:                       7.451   Cond. No.                         209.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    ppg   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     15.29
Date:                Mon, 23 Nov 2015   Prob (F-statistic):           9.22e-10
Time:                        13:13:10   Log-Likelihood:                -3086.5
No. Observations:                1179   AIC:                             6181.
Df Residuals:                    1175   BIC:                             6201.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      4.8158      0.388     12.399      0.000         4.054     5.578
years          1.2808      0.905      1.415      0.157        -0.495     3.056
years2        -0.0579      0.565     -0.103      0.918        -1.166     1.050
years3        -0.0422      0.100     -0.424      0.672        -0.238     0.153
==============================================================================
Omnibus:                      664.372   Durbin-Watson:                   1.835
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6790.976
Skew:                           2.428   Prob(JB):                         0.00
Kurtosis:                      13.708   Cond. No.                         213.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [29]:

    
def RunPolyModel(daily, pow=1):
    formula = 'ppg ~ years'
    if pow > 1:
        for i in xrange(2,pow + 1):
            name = "years" + str(i)
            daily[name] = daily.years**i
            formula += ' + ' + name
    model = smf.ols(formula, data=daily)
    results = model.fit()
    return model, results



In [39]:

    
def GenerateSimplePrediction(results, years):
    """Generates a simple prediction.

    results: results object
    years: sequence of times (in years) to make predictions for

    returns: sequence of predicted values
    """
    n = len(years)
    inter = np.ones(n)
    d = dict(Intercept=inter, years=years, years2=years**2)
    print(d)
    # predict_df = pandas.DataFrame(d)
    # predict = results.predict(predict_df)
    # print(predict)
    # return predict



In [41]:

    
GenerateSimplePrediction(results, [1])









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-41-83ce51493ef6> in <module>()
----> 1 GenerateSimplePrediction(results, [1])

<ipython-input-39-c5673557e900> in GenerateSimplePrediction(results, years)
      9     n = len(years)
     10     inter = np.ones(n)
---> 11     d = dict(Intercept=inter, years=years, years2=years**2)
     12     print(d)
     13     # predict_df = pandas.DataFrame(d)

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'



In [58]:

    
def GeneratePredictions(result_seq, years, add_resid=False):
    """Generates an array of predicted values from a list of model results.

    When add_resid is False, predictions represent sampling error only.

    When add_resid is True, they also include residual error (which is
    more relevant to prediction).
    
    result_seq: list of model results
    years: sequence of times (in years) to make predictions for
    add_resid: boolean, whether to add in resampled residuals

    returns: sequence of predictions
    """
    n = len(years)
    d = dict(Intercept=np.ones(n), years=years, years2=years**2)
    predict_df = pandas.DataFrame(d)
    
    predict_seq = []
    for fake_results in result_seq:
        predict = fake_results.predict(predict_df)
        if add_resid:
            predict += thinkstats2.Resample(fake_results.resid, n)
        predict_seq.append(predict)

    return predict_seq



In [48]:

    
def SimulateResults(daily, iters=101, func=RunLinearModel):
    """Run simulations based on resampling residuals.

    daily: DataFrame of daily prices
    iters: number of simulations
    func: function that fits a model to the data

    returns: list of result objects
    """
    _, results = func(daily)
    fake = daily.copy()
    
    result_seq = []
    for _ in range(iters):
        fake.ppg = results.fittedvalues + thinkstats2.Resample(results.resid)
        _, fake_results = func(fake)
        result_seq.append(fake_results)

    return result_seq



In [49]:

    
def PlotPredictions(daily, years, iters=101, percent=90, func=RunLinearModel):
    """Plots predictions.

    daily: DataFrame of daily prices
    years: sequence of times (in years) to make predictions for
    iters: number of simulations
    percent: what percentile range to show
    func: function that fits a model to the data
    """
    result_seq = SimulateResults(daily, iters=iters, func=func)
    p = (100 - percent) / 2
    percents = p, 100-p

    predict_seq = GeneratePredictions(result_seq, years, add_resid=True)
    low, high = thinkstats2.PercentileRows(predict_seq, percents)
    thinkplot.FillBetween(years, low, high, alpha=0.3, color='gray')

    predict_seq = GeneratePredictions(result_seq, years, add_resid=False)
    low, high = thinkstats2.PercentileRows(predict_seq, percents)
    thinkplot.FillBetween(years, low, high, alpha=0.5, color='gray')



In [59]:

    
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
    thinkplot.SubPlot(i)
    start = daily.years.min()
    PlotPredictions(daily=daily, years=np.arange(start, 5, 0.01), func=RunQuadraticModel)
    thinkplot.Scatter(daily.years, daily.ppg)
    i += 1



In [84]:

    
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
    thinkplot.SubPlot(i)
    
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)
    reindexed.date = reindexed.index
    
    start = daily.date[0]
    one_year = np.timedelta64(1, 'Y')
    reindexed.years = (reindexed.date - start) / one_year
    
    start = reindexed.years.min()
    PlotPredictions(daily=reindexed, years=np.arange(start, 5, 0.01), func=RunQuadraticModel)
    thinkplot.Scatter(reindexed.years, reindexed.ppg)
    i += 1



In [131]:

    
class SerialCorrelationTest(thinkstats2.HypothesisTest):
    """Tests serial correlations by permutation."""

    def __init__(self, data, lag):
        self.data = ReindexData(data)
        self.lag = lag
        self.actual = self.TestStatistic(self.data)
    
    def TestStatistic(self, data):
        """Computes the test statistic.

        data: tuple of xs and ys
        """
        test_stat = abs(thinkstats2.SerialCorr(data.ppg, self.lag))
        return test_stat

    def RunModel(self):
        """Run the model of the null hypothesis.

        returns: simulated data
        """
        test_data = self.data.copy()
        test_ppg = thinkstats2.Resample(self.data.ppg)
        test_data.ppg = test_ppg
        return test_data



In [14]:

    
def ReindexData(daily):
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    
    ewma = pandas.ewma(reindexed.ppg, span=30)
    res = (reindexed.ppg - ewma).dropna()
    fake_data = ewma + thinkstats2.Resample(res, len(reindexed))
    reindexed.ppg.fillna(fake_data, inplace=True)
    return reindexed



In [134]:

    
for name, daily in dailies.items():
    sct = SerialCorrelationTest(daily, 1)
    print(name, '{0:.5f}'.format(sct.PValue(iters=5000)))









    



high 0.00000
medium 0.00000
low 0.00000



In [135]:

    
class SerialCorrelationTest2(thinkstats2.HypothesisTest):
    """Tests serial correlations by permutation."""

    def TestStatistic(self, data):
        """Computes the test statistic.

        data: tuple of xs and ys
        """
        series, lag = data
        test_stat = abs(thinkstats2.SerialCorr(series, lag))
        return test_stat

    def RunModel(self):
        """Run the model of the null hypothesis.

        returns: simulated data
        """
        series, lag = self.data
        permutation = series.reindex(np.random.permutation(series.index))
        return permutation, lag



In [118]:

    
for name, daily in dailies.items():
    sct = SerialCorrelationTest2((daily.ppg, 1))
    print(name, '{0:.3f}'.format(sct.PValue(iters=5000)))









    



high 0.000
medium 0.000
low 0.000



In [12]:

    
def EWMAPredict(daily, date):
    results = ReindexData(daily.copy())
    
    date_ranges = pandas.date_range(results.index.max(), date)
    for day in date_ranges:
        ewma = pandas.ewma(results.ppg, span=30)
        inter = ewma[-1]
        ewma_slope = pandas.ewma(results.ppg.diff(), span=30).dropna()
        slope = ewma_slope[-1]
        
        results.reindex(pandas.date_range(results.index.min(), day))
        results.ppg[-1] = inter + slope
        
    results = results[daily.index.max():]
    return results



In [15]:

    
thinkplot.PrePlot(rows=3)
i = 1
for name, daily in dailies.items():
    thinkplot.SubPlot(i)
    results = EWMAPredict(daily, '2015-01-01')
    thinkplot.Scatter(daily.index, daily.ppg)
    thinkplot.Plot(results.index, results.ppg)
    i += 1









    



/Users/nathankiner/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/nathankiner/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [10]:

    
for name, daily in dailies.items():
    print(daily.index.max())









    



2014-05-13 00:00:00
2014-05-13 00:00:00
2014-05-13 00:00:00



In [ ]:

	city	state	price	amount	quality	date	ppg	state.name	lat	lon
0	Annandale	VA	100	7.075	high	2010-09-02	14.13	Virginia	38.830345	-77.213870
1	Auburn	AL	60	28.300	high	2010-09-02	2.12	Alabama	32.578185	-85.472820
2	Austin	TX	60	28.300	medium	2010-09-02	2.12	Texas	30.326374	-97.771258
3	Belleville	IL	400	28.300	high	2010-09-02	14.13	Illinois	38.532311	-89.983521
4	Boone	NC	55	3.540	high	2010-09-02	15.54	North Carolina	36.217052	-81.687983