notebook.community

Edit and run



In [79]:

    
# Toy input: the integers 0 through 9.
series = np.arange(0, 10)
series









    Out[79]:





array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])



In [80]:

    
# 3-point rolling mean of 0..9; in the recorded output the first two
# entries are NaN.
series = np.arange(0, 10)
pandas.rolling_mean(series, window=3)









    Out[80]:





array([ nan,  nan,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.])



In [231]:

    
import pandas
import thinkplot
import thinkstats2
import datetime
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
import statsmodels.tsa.stattools as smtsa
import timeseries
import scipy.signal as sig

# Read the transactions and pick the daily frame stored under the key 'high'.
transactions = timeseries.ReadData()
dailies = timeseries.GroupByQualityAndDay(transactions)
name = 'high'
daily = dailies[name]
thinkplot.Scatter(daily.index, daily.ppg)

# First differences of ppg (the leading element, which has no predecessor,
# is sliced off), compared with a randomly re-ordered copy of the same values.
series = daily.ppg.diff()[1:]
shuffled_index = np.random.permutation(series.index)
permutation = series.reindex(shuffled_index)
(thinkstats2.SerialCorr(series, 1),
 thinkstats2.SerialCorr(permutation, 1))









    Out[231]:





(-0.48213153529447333, -0.0022153087890049057)



In [206]:

    
# 101 evenly spaced integer offsets spanning 0..len(daily).
starts = np.linspace(0, len(daily), num=101).astype(int)

# Fit the linear model on successively shorter tails of `daily`; the final
# two offsets are skipped.  Only the last iteration's `fake` and
# `fake_results` survive the loop.
for start in starts[:-2]:
    fake = daily[start:]
    _, fake_results = timeseries.RunLinearModel(fake)

fake









    Out[206]:






  
    
      
      ppg
      date
      years
    
    
      date
      
      
      
    
  
  
    
      2014-04-19
       11.841000
      2014-04-19
       3.627727
    
    
      2014-04-20
       10.712308
      2014-04-20
       3.630465
    
    
      2014-04-21
       10.984795
      2014-04-21
       3.633203
    
    
      2014-04-22
       10.760217
      2014-04-22
       3.635941
    
    
      2014-04-23
       10.920000
      2014-04-23
       3.638678
    
    
      2014-04-24
       10.250833
      2014-04-24
       3.641416
    
    
      2014-04-25
       10.482462
      2014-04-25
       3.644154
    
    
      2014-04-26
       11.379254
      2014-04-26
       3.646892
    
    
      2014-04-27
       11.482391
      2014-04-27
       3.649630
    
    
      2014-04-28
       10.784516
      2014-04-28
       3.652368
    
    
      2014-04-29
       11.750556
      2014-04-29
       3.655106
    
    
      2014-04-30
       11.212727
      2014-04-30
       3.657844
    
    
      2014-05-01
       10.286923
      2014-05-01
       3.660582
    
    
      2014-05-02
       12.260000
      2014-05-02
       3.663320
    
    
      2014-05-03
        9.942941
      2014-05-03
       3.666057
    
    
      2014-05-04
       10.552459
      2014-05-04
       3.668795
    
    
      2014-05-05
       11.290244
      2014-05-05
       3.671533
    
    
      2014-05-06
       11.199710
      2014-05-06
       3.674271
    
    
      2014-05-07
       12.064058
      2014-05-07
       3.677009
    
    
      2014-05-08
        9.953333
      2014-05-08
       3.679747
    
    
      2014-05-09
       11.468298
      2014-05-09
       3.682485
    
    
      2014-05-10
       10.532326
      2014-05-10
       3.685223
    
    
      2014-05-11
       11.518750
      2014-05-11
       3.687961
    
    
      2014-05-12
       10.578293
      2014-05-12
       3.690699
    
    
      2014-05-13
        9.604615
      2014-05-13
       3.693437
    
  

25 rows × 3 columns



In [82]:

    
# Reindex onto every date between the first and last observation, then
# count the rows that have no ppg value.
first_day = daily.index.min()
last_day = daily.index.max()
dates = pandas.date_range(start=first_day, end=last_day)
daily_missing = daily.reindex(dates)
daily_missing['ppg'].isnull().sum()









    Out[82]:





109



In [83]:

    
# 30-row rolling mean of ppg; min_periods=5 lets a window produce a value
# once it holds at least five observations.
roll_mean = pandas.rolling_mean(daily_missing['ppg'], window=30, min_periods=5)
thinkplot.Scatter(roll_mean.index, roll_mean)
roll_mean.isnull().sum()









    Out[83]:





85



In [84]:

    
# Exponentially weighted moving average of ppg with span=30.
ma = pandas.ewma(daily_missing.ppg, span=30)
# Plot the EWMA itself.  This call previously passed `roll_mean` as the
# y-values, so the figure showed the rolling mean from another cell rather
# than the series computed here.
thinkplot.Scatter(ma.index, ma)
ma.isnull().sum()









    Out[84]:





0



In [85]:

    
# EWMA of the row-to-row change in ppg, with center of mass (com) 180.
trend = pandas.ewma(daily_missing['ppg'].diff(), com=180)
thinkplot.Scatter(trend.index, trend)
trend









    Out[85]:





2010-09-02         NaN
2010-09-03    1.075402
2010-09-04    0.768726
2010-09-05    1.095676
2010-09-06    0.534625
2010-09-07   -0.144260
2010-09-08    0.074448
2010-09-09    0.074448
2010-09-10    0.074448
2010-09-11    0.074448
2010-09-12   -0.550740
2010-09-13   -0.226955
2010-09-14   -0.159124
2010-09-15   -0.159225
2010-09-16   -0.146813
...
2014-04-29   -0.014516
2014-04-30   -0.017411
2014-05-01   -0.022435
2014-05-02   -0.011398
2014-05-03   -0.024150
2014-05-04   -0.020646
2014-05-05   -0.016451
2014-05-06   -0.016860
2014-05-07   -0.011986
2014-05-08   -0.023594
2014-05-09   -0.015085
2014-05-10   -0.020178
2014-05-11   -0.014611
2014-05-12   -0.019731
2014-05-13   -0.025007
Freq: D, Length: 1350



In [86]:

    
# NOTE(review): scratch arithmetic.  109 equals the missing-row count
# reported by the reindexing cell, but the meaning of 232 and 15.0 is not
# visible here -- confirm, or replace with named values.
(232-109)/15.0









    Out[86]:





8.2



In [87]:

    
# Fill the gaps in the rolling mean: time-based interpolation first, then
# back-fill for any values that are still missing.
roll_mean2 = (roll_mean
              .interpolate(method='time')
              .fillna(method='backfill'))
thinkplot.Scatter(roll_mean2.index, roll_mean2)
roll_mean2.isnull().sum()









    Out[87]:





0



In [88]:

    
# Residuals of ppg around the gap-filled rolling mean; rows where the
# difference is NaN are dropped before plotting.
resid = daily_missing['ppg'].sub(roll_mean2)
roll_resid = resid.dropna()
thinkplot.Scatter(roll_resid.index, roll_resid)
roll_resid.isnull().sum()









    Out[88]:





0



In [89]:

    
# Store the residuals (ppg minus the gap-filled rolling mean) as a column,
# then keep the non-missing ones for plotting.
daily_missing['resid'] = daily_missing['ppg'].sub(roll_mean2)
roll_resid = daily_missing['resid'].dropna()
thinkplot.Scatter(roll_resid.index, roll_resid)
roll_resid.isnull().sum()









    Out[89]:





0



In [90]:

    
# Fill missing ppg values with the smoothed series plus a residual drawn
# (with replacement) from the observed residuals.
fake_resid = np.random.choice(roll_resid, size=len(daily_missing), replace=True)
fill_values = roll_mean2 + fake_resid
daily_missing['ppg'] = daily_missing['ppg'].fillna(fill_values)
thinkplot.Scatter(daily_missing.index, daily_missing.ppg)
daily_missing.isnull().sum()









    Out[90]:





ppg        0
date     109
years    109
resid    109
dtype: int64



In [154]:

    
thinkstats2.RandomSeed(17)
filled = timeseries.FillMissing(daily)
sum(filled.ppg.isnull())

# Correlate the residuals with a copy of themselves shifted by `lag` rows;
# the first `lag` rows are skipped on both sides.
lag = 92
filled_resid = filled.resid
xs = filled_resid[lag:]
ys = filled_resid.shift(lag)[lag:]
thinkstats2.Corr(xs, ys)









    Out[154]:





-0.046091270094984337



In [155]:

    
# Cross-check with pandas: correlation between the xs and ys columns.
df = pandas.DataFrame({'xs': xs, 'ys': ys})
df.corr().loc['ys', 'xs']









    Out[155]:





-0.046091270094984253



In [161]:

    
# Autocorrelation of the residuals out to lag 100 with unbiased=True;
# show the values at lags 0, 1, 7 and 30.
acf = smtsa.acf(filled.resid, unbiased=True, nlags=100)
tuple(acf[k] for k in (0, 1, 7, 30))









    Out[161]:





(1.0, -0.021766740358017199, 0.0025113249577623058, -0.0061660280108939121)



In [160]:

    
# Autocorrelation of the residuals out to lag 100 with unbiased=False;
# show the values at lags 0, 1, 7 and 30.
acf = smtsa.acf(filled.resid, unbiased=False, nlags=100)
tuple(acf[k] for k in (0, 1, 7, 30))









    Out[160]:





(1.0, -0.02175061684664089, 0.0024983032727961311, -0.0060290051662073817)



In [194]:

    
# Extend the 'ewma' column of the filled series one year past the end of
# the data by linear extrapolation.
thinkstats2.RandomSeed(17)
filled = timeseries.FillMissing(daily)
filled['slope'] = pandas.ewma(filled.ppg.diff(), 500)

# Anchor of the extrapolation: the last date, the last 'ewma' value
# (intercept) and the last smoothed row-to-row change (slope).
start = filled.index[-1]
inter = filled['ewma'].iloc[-1]
slope = filled['slope'].iloc[-1]

# Reindex onto a range that runs 365 days beyond the last observation;
# the added rows are NaN in every column.
dates = pandas.date_range(filled.index.min(), daily.index.max()+np.timedelta64(365, 'D'))
predicted = filled.reindex(dates)

# Days elapsed since `start` for every row (negative before it).
predicted['date'] = predicted.index
one_day = np.timedelta64(1, 'D')
predicted['days'] = (predicted.date - start) / one_day
predict = inter + slope * predicted.days

# Assign the filled column back explicitly.  Calling fillna(inplace=True)
# on `predicted.ewma` mutates whatever the attribute lookup returned, which
# is not guaranteed to write through to the frame.
predicted['ewma'] = predicted['ewma'].fillna(predict)
thinkplot.Plot(predicted['ewma'])



In [12]:

    
# Lagged copies of ppg.  shift(-k) moves values k rows earlier, so column
# 'ppgK' on a given row holds the ppg value found K rows later.
# The roughly-monthly lag uses a whole number of rows (30): shift() takes an
# integer period, and the earlier -30.5 was not a valid row offset.
for shift_days in (1, 7, 30, 365):
    daily_missing['ppg%d' % shift_days] = daily_missing.ppg.shift(-shift_days)

# Centered 30-row rolling means of the original and lagged series, drawn on
# the same axes.
ppg_mean = pandas.rolling_mean(daily_missing.ppg, 30, center=True)
thinkplot.Plot(daily_missing.index, ppg_mean)

ppg7_mean = pandas.rolling_mean(daily_missing.ppg7, 30, center=True)
thinkplot.Plot(daily_missing.index, ppg7_mean)

ppg30_mean = pandas.rolling_mean(daily_missing.ppg30, 30, center=True)
thinkplot.Plot(daily_missing.index, ppg30_mean)

ppg365_mean = pandas.rolling_mean(daily_missing.ppg365, 30, center=True)
thinkplot.Plot(daily_missing.index, ppg365_mean)

# Pairwise correlations between ppg and its lagged copies.
daily_missing[['ppg', 'ppg1', 'ppg7', 'ppg30', 'ppg365']].corr()









    



Warning: Brewer ran out of colors.






    Out[12]:






  
    
      
      ppg
      ppg1
      ppg7
      ppg30
      ppg365
    
  
  
    
      ppg
       1.000000
       0.486785
       0.485610
       0.454358
       0.337646
    
    
      ppg1
       0.486785
       1.000000
       0.466057
       0.466313
       0.294258
    
    
      ppg7
       0.485610
       0.466057
       1.000000
       0.483190
       0.259982
    
    
      ppg30
       0.454358
       0.466313
       0.483190
       1.000000
       0.320931
    
    
      ppg365
       0.337646
       0.294258
       0.259982
       0.320931
       1.000000
    
  

5 rows × 5 columns



In [13]:

    
# TODO: run this analysis on the residuals
# Requires daily_missing['resid'] to exist already; the recorded run of this
# cell raised AttributeError because the column was missing.
# Each 'residK' column is shifted by exactly K rows.  Previously 'resid7'
# was shifted by 5 rows and 'resid30' by the non-integer 30.5.
for shift_days in (1, 7, 30, 365):
    daily_missing['resid%d' % shift_days] = daily_missing.resid.shift(-shift_days)

# Centered 30-row rolling means of the residuals and their lagged copies,
# drawn on the same axes.
resid_mean = pandas.rolling_mean(daily_missing.resid, 30, center=True)
thinkplot.Plot(daily_missing.index, resid_mean)

resid7_mean = pandas.rolling_mean(daily_missing.resid7, 30, center=True)
thinkplot.Plot(daily_missing.index, resid7_mean)

resid30_mean = pandas.rolling_mean(daily_missing.resid30, 30, center=True)
thinkplot.Plot(daily_missing.index, resid30_mean)

resid365_mean = pandas.rolling_mean(daily_missing.resid365, 30, center=True)
thinkplot.Plot(daily_missing.index, resid365_mean)

# Pairwise correlations between the residuals and their lagged copies.
daily_missing[['resid', 'resid1', 'resid7', 'resid30', 'resid365']].corr()









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-13-92f485982338> in <module>()
      1 # TODO: run this analysis on the residuals
----> 2 daily_missing['resid1'] = daily_missing.resid.shift(-1)
      3 daily_missing['resid7'] = daily_missing.resid.shift(-5)
      4 daily_missing['resid30'] = daily_missing.resid.shift(-30.5)
      5 daily_missing['resid365'] = daily_missing.resid.shift(-365)

/home/downey/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in __getattr__(self, name)
   1813                 return self[name]
   1814             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1815                                  (type(self).__name__, name))
   1816 
   1817     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'resid'



In [30]:

    
# Regress the residuals on the residuals stored in the 'resid365' column.
formula = 'resid ~ resid365'
model = smf.ols(formula=formula, data=daily_missing)
results = model.fit()
results.summary()









    Out[30]:





OLS Regression Results

  Dep. Variable:           resid         R-squared:             0.005


  Model:                    OLS          Adj. R-squared:        0.004


  Method:              Least Squares     F-statistic:           4.820


  Date:              Sat, 05 Jul 2014    Prob (F-statistic):   0.0284 


  Time:                  10:55:55        Log-Likelihood:      -1199.5


  No. Observations:          985         AIC:                   2403.


  Df Residuals:              983         BIC:                   2413.


  Df Model:                    1                                     




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      0.0095      0.026      0.364   0.716     -0.042     0.061


  resid365       0.0789      0.036      2.195   0.028      0.008     0.149




  Omnibus:        31.735    Durbin-Watson:         2.020


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     76.140


  Skew:            0.071    Prob(JB):           2.93e-17


  Kurtosis:        4.355    Cond. No.               1.38



In [22]:

    
# Regress ppg on its three lagged copies (ppg1, ppg7, ppg30).
formula = 'ppg ~ ppg1 + ppg7 + ppg30'
model = smf.ols(formula=formula, data=daily_missing)
results = model.fit()
results.summary()









    Out[22]:





OLS Regression Results

  Dep. Variable:            ppg          R-squared:             0.353 


  Model:                    OLS          Adj. R-squared:        0.351 


  Method:              Least Squares     F-statistic:           239.0 


  Date:              Fri, 04 Jul 2014    Prob (F-statistic):  8.82e-124


  Time:                  19:39:03        Log-Likelihood:      -1709.6 


  No. Observations:         1320         AIC:                   3427. 


  Df Residuals:             1316         BIC:                   3448. 


  Df Model:                    3                                      




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      3.1390      0.339      9.270   0.000      2.475     3.803


  ppg1           0.2556      0.026      9.649   0.000      0.204     0.308


  ppg7           0.2668      0.027      9.961   0.000      0.214     0.319


  ppg30          0.2214      0.027      8.145   0.000      0.168     0.275




  Omnibus:        71.414    Durbin-Watson:         2.258


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    193.744


  Skew:            0.254    Prob(JB):           8.49e-43


  Kurtosis:        4.807    Cond. No.               294.



In [35]:

    
thinkplot.Scatter(daily.index, daily.ppg, alpha=0.1)

# The rolling_window output is divided by the mean of a 30-point triangular
# window before it is plotted over the raw points.
triangle = sig.get_window('triangle', 30)
triangle_mean = triangle.mean()
roll_mean = pandas.rolling_window(daily.ppg, window=30, win_type='triang')
roll_mean = roll_mean / triangle_mean
thinkplot.Plot(daily.index, roll_mean)



In [ ]:



In [24]:

    
# ARIMA model of the daily ppg series with order (7, 1, 2).
model = tsa.arima_model.ARIMA(daily.ppg, order=(7, 1, 2))



In [23]:

    
# Fit whichever `model` is currently bound in the kernel and display its
# summary table.
results = model.fit()
results.summary()









    Out[23]:





ARIMA Model Results

  Dep. Variable:        D.ppg         No. Observations:     1240   


  Model:           ARIMA(7, 1, 2)     Log Likelihood      -1487.759


  Method:              css-mle        S.D. of innovations    0.802  


  Date:           Fri, 04 Jul 2014    AIC                 2997.517 


  Time:               15:54:01        BIC                 3053.869 


  Sample:            09-03-2010       HQIC                3018.710 


                   - 05-13-2014                                   




                 coef      std err       z       P>|z|  [95.0% Conf. Int.] 


  const           -0.0022      0.001     -2.994   0.003     -0.004    -0.001


  ar.L1.D.ppg     -0.6489      0.260     -2.499   0.013     -1.158    -0.140


  ar.L2.D.ppg     -0.0144      0.039     -0.365   0.715     -0.091     0.063


  ar.L3.D.ppg     -0.0052      0.040     -0.130   0.896     -0.083     0.073


  ar.L4.D.ppg     -0.0224      0.039     -0.578   0.563     -0.098     0.054


  ar.L5.D.ppg     -0.0256      0.040     -0.643   0.520     -0.103     0.052


  ar.L6.D.ppg     -0.0238      0.039     -0.618   0.537     -0.100     0.052


  ar.L7.D.ppg      0.0100      0.033      0.303   0.762     -0.055     0.075


  ma.L1.D.ppg     -0.3050      0.258     -1.181   0.238     -0.811     0.201


  ma.L2.D.ppg     -0.6419      0.248     -2.585   0.010     -1.129    -0.155


              -1.3999                    -0.4369j                    1.4665                   -0.4519     
              -1.3999                    +0.4369j                    1.4665                    0.4519     
              -0.5930                    -1.7284j                    1.8273                   -0.3026     
              -0.5930                    +1.7284j                    1.8273                    0.3026     
               1.4360                    -1.3843j                    1.9946                   -0.1221     
               1.4360                    +1.3843j                    1.9946                    0.1221     
               3.4940                    -0.0000j                    3.4940                   -0.0000     
               1.0329                    +0.0000j                    1.0329                    0.0000     
              -1.5081                    +0.0000j                    1.5081                    0.5000     

Roots

                  Real           Imaginary           Modulus          Frequency


  AR.1

  AR.2

  AR.3

  AR.4

  AR.5

  AR.6

  AR.7

  MA.1

  MA.2



In [196]:

    
# 365 consecutive rows of `ppg` starting at 121; the recorded output shows
# them running from 2011-01-01 through 2011-12-31.
year11 = ppg[121:121 + 365]
year11
year11.date[121], year11.date[121 + 364]









    Out[196]:





(Timestamp('2011-01-01 00:00:00', tz=None),
 Timestamp('2011-12-31 00:00:00', tz=None))



In [210]:

    
# Raw 2011 series with a centered 30-row rolling mean drawn over it in yellow.
thinkplot.Plot(year11.index, year11.ppg)
roll_mean = pandas.rolling_mean(year11.ppg, window=30, center=True)
thinkplot.Plot(year11.index, roll_mean, color='yellow')
tuple(year11.date[k] for k in (250, 350, 430))









    Out[210]:





(Timestamp('2011-05-10 00:00:00', tz=None),
 Timestamp('2011-08-18 00:00:00', tz=None),
 Timestamp('2011-11-06 00:00:00', tz=None))



In [300]:

    
import scipy.signal as sig

# For each 30-point window: record its raw mean, rescale the window in place
# so that its mean is 1, and plot the rescaled shape.

# Gaussian window with standard deviation 7.5.
gaussian = sig.get_window(('gaussian', 7.5), 30)
gaussian_mean = gaussian.mean()
gaussian /= gaussian_mean
thinkplot.Plot(gaussian)

# Boxcar (rectangular) window.
boxcar = sig.get_window('boxcar', 30)
boxcar_mean = boxcar.mean()
boxcar /= boxcar_mean
thinkplot.Plot(boxcar)

# Triangular window.
triangle = sig.get_window('triangle', 30)
triangle_mean = triangle.mean()
triangle /= triangle_mean
thinkplot.Plot(triangle)









    



Warning: Brewer ran out of colors.



In [301]:

    
# Raw 2011 series with a centered 30-row triangular-window smooth, divided
# by triangle_mean, drawn over it in yellow.
thinkplot.Plot(year11.index, year11.ppg)
roll_mean = pandas.rolling_window(year11.ppg, window=30, win_type='triang',
                                  center=True)
roll_mean /= triangle_mean
thinkplot.Plot(year11.index, roll_mean, color='yellow')
tuple(year11.date[k] for k in (250, 350, 430))









    Out[301]:





(Timestamp('2011-05-10 00:00:00', tz=None),
 Timestamp('2011-08-18 00:00:00', tz=None),
 Timestamp('2011-11-06 00:00:00', tz=None))



In [303]:

    
# Raw 2011 series with a centered 30-row Gaussian-window smooth (std=7.5),
# divided by gaussian_mean, drawn over it in yellow.
thinkplot.Plot(year11.index, year11.ppg)
roll_mean = pandas.rolling_window(year11.ppg, window=30, win_type='gaussian',
                                  std=7.5, center=True)
roll_mean /= gaussian_mean
thinkplot.Plot(year11.index, roll_mean, color='yellow')
tuple(year11.date[k] for k in (250, 350, 430))









    Out[303]:





(Timestamp('2011-05-10 00:00:00', tz=None),
 Timestamp('2011-08-18 00:00:00', tz=None),
 Timestamp('2011-11-06 00:00:00', tz=None))



In [113]:

    
# Reindex onto every integer from the smallest to the largest index value,
# so that absent index values become explicit rows.
index_values = ppg.index.values
low, high = index_values.min(), index_values.max()
ppg = ppg.reindex(np.arange(low, high + 1))
thinkplot.Plot(ppg.index, ppg.ppg)



In [114]:

    
# Linear model of ppg against the index expressed in units of 365.
ppg['years'] = ppg.index / 365.0
model = smf.ols(formula='ppg ~ years', data=ppg)
results = model.fit()
results.summary()









    Out[114]:





OLS Regression Results

  Dep. Variable:            ppg          R-squared:             0.141


  Model:                    OLS          Adj. R-squared:        0.140


  Method:              Least Squares     F-statistic:           203.2


  Date:              Mon, 30 Jun 2014    Prob (F-statistic):  8.17e-43


  Time:                  09:42:45        Log-Likelihood:      -1495.5


  No. Observations:         1243         AIC:                   2995.


  Df Residuals:             1241         BIC:                   3005.


  Df Model:                    1                                     




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept     11.3535      0.045    254.165   0.000     11.266    11.441


  years         -0.3156      0.022    -14.255   0.000     -0.359    -0.272




  Omnibus:        193.361    Durbin-Watson:         1.787


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   2573.911


  Skew:           -0.213     Prob(JB):               0.00


  Kurtosis:       10.037     Cond. No.               4.69



In [343]:

    
# Pair column 1 of the fitted model's design matrix with the model's
# residuals in one frame.  All statements sit at column 0: the stray
# indentation on the later lines made the cell an IndentationError.
# NOTE(review): column 1 is presumably the `years` regressor -- confirm
# against the model that was fitted last.
xs = model.exog[:,1]
ys = results.resid
df = pandas.DataFrame(dict(xs=xs, ys=ys))
df









    Out[343]:






  
    
      
      xs
      ys
    
  
  
    
      0 
       0.000000
       0.465443
    
    
      1 
       0.002740
       0.711054
    
    
      2 
       0.005479
       2.491114
    
    
      3 
       0.008219
       3.852401
    
    
      4 
       0.010959
       4.277022
    
    
      5 
       0.013699
       1.454977
    
    
      6 
       0.016438
       3.082432
    
    
      7 
       0.019178
       1.450888
    
    
      8 
       0.021918
      -7.018656
    
    
      9 
       0.024658
       4.281802
    
    
      10
       0.027397
       0.052259
    
    
      11
       0.030137
       2.247718
    
    
      12
       0.032877
       0.584909
    
    
      13
       0.035616
       0.085313
    
    
      14
       0.038356
      -0.111767
    
    
      15
       0.041096
       0.414558
    
    
      16
       0.043836
       0.148125
    
    
      17
       0.046575
       0.625406
    
    
      18
       0.049315
       0.480564
    
    
      19
       0.052055
       0.167461
    
    
      20
       0.054795
       0.250639
    
    
      21
       0.057534
      -0.181383
    
    
      22
       0.060274
      -0.282488
    
    
      23
       0.063014
      -0.585985
    
    
      24
       0.065753
       0.179916
    
    
      25
       0.068493
       0.759256
    
    
      26
       0.071233
      -1.298432
    
    
      27
       0.073973
       0.295715
    
    
      28
       0.076712
       0.348029
    
    
      29
       0.079452
       0.101499
    
    
      30
       0.082192
      -0.143547
    
    
      31
       0.084932
       0.128941
    
    
      32
       0.087671
       0.501490
    
    
      33
       0.090411
       0.170206
    
    
      34
       0.093151
      -0.230790
    
    
      35
       0.095890
       1.170705
    
    
      36
       0.098630
      -1.374425
    
    
      37
       0.101370
      -0.025143
    
    
      38
       0.104110
      -1.071267
    
    
      39
       0.106849
      -0.203062
    
    
      40
       0.109589
       0.734571
    
    
      41
       0.112329
      -1.071625
    
    
      42
       0.115068
       0.454933
    
    
      43
       0.117808
      -0.226399
    
    
      44
       0.120548
      -0.251214
    
    
      45
       0.123288
      -1.173061
    
    
      46
       0.126027
      -1.758051
    
    
      47
       0.128767
       0.156741
    
    
      48
       0.131507
      -0.415264
    
    
      49
       0.134247
       1.345376
    
    
      50
       0.136986
      -1.331353
    
    
      51
       0.139726
      -0.708725
    
    
      52
       0.142466
       0.200268
    
    
      53
       0.145205
      -2.121308
    
    
      54
       0.147945
       0.114788
    
    
      55
       0.150685
      -3.026490
    
    
      56
       0.153425
      -1.202670
    
    
      57
       0.156164
      -0.418135
    
    
      58
       0.158904
      -1.966140
    
    
      59
       0.161644
      -1.052760
    
    
      
      ...
      ...
    
  

1243 rows × 2 columns



In [115]:

    
# Plot the model residuals against column 1 of the design matrix.
years = model.exog[:, 1]
residuals = results.resid
thinkplot.Plot(years, residuals)



In [139]:

    
# Quadratic model: ppg against years and years squared.
ppg['years'] = ppg.index / 365.0
ppg['years2'] = ppg['years'] ** 2
model = smf.ols(formula='ppg ~ years + years2', data=ppg)
results = model.fit()
results.summary()









    Out[139]:





OLS Regression Results

  Dep. Variable:            ppg          R-squared:             0.143


  Model:                    OLS          Adj. R-squared:        0.142


  Method:              Least Squares     F-statistic:           103.6


  Date:              Mon, 30 Jun 2014    Prob (F-statistic):  2.49e-42


  Time:                  09:54:11        Log-Likelihood:      -1493.7


  No. Observations:         1243         AIC:                   2993.


  Df Residuals:             1240         BIC:                   3009.


  Df Model:                    2                                     




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept     11.2623      0.066    171.106   0.000     11.133    11.391


  years         -0.1649      0.083     -1.988   0.047     -0.328    -0.002


  years2        -0.0417      0.022     -1.884   0.060     -0.085     0.002




  Omnibus:        187.232    Durbin-Watson:         1.792


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   2494.368


  Skew:           -0.155     Prob(JB):               0.00


  Kurtosis:        9.933     Cond. No.               27.4



In [140]:

    
# Plot the model residuals against column 1 of the design matrix.
years = model.exog[:, 1]
residuals = results.resid
thinkplot.Plot(years, residuals)



In [144]:

    
# Inspect the row whose index label is 100.
ppg.loc[100]









    Out[144]:





ppg       10.759714
years      0.273973
years2     0.075061
Name: 100, dtype: float64



In [116]:

    
# Histogram of the differences between consecutive index values.
index_gaps = np.diff(ppg.index.values)
thinkstats2.Hist(index_gaps).Render()









    Out[116]:





[(1,), (1349,)]



In [263]:

    
from pandas.tools.plotting import autocorrelation_plot

# Autocorrelation plot of the row-to-row changes in ppg (NaNs removed).
ppg_changes = ppg.ppg.diff().dropna()
autocorrelation_plot(ppg_changes)









    Out[263]:





<matplotlib.axes.AxesSubplot at 0xfc66b10>



In [118]:

    
# Rows with quality 'high', split by state: 'CA' is plotted in the default
# color and 'MA' in red.
high = df[df['quality'] == 'high']
ca_high = high[high['state'] == 'CA']
ma_high = high[high['state'] == 'MA']
thinkplot.Scatter(ca_high.days, ca_high.ppg, alpha=0.05)
thinkplot.Scatter(ma_high.days, ma_high.ppg, alpha=0.05, color='red')



In [120]:

    
# Correlation matrix of two independent standard-normal samples.
# `ddof` is not passed: np.corrcoef documents it as deprecated and as having
# no effect on the result.
x = np.random.randn(100)
y = np.random.randn(100)
np.corrcoef(x, y)









    Out[120]:





array([[ 1.        ,  0.00992923],
       [ 0.00992923,  1.        ]])



In [121]:

    
# Two independent standard-normal series; one value of x is set to NaN
# before the correlation is computed.
x = pandas.Series(np.random.randn(100))
y = pandas.Series(np.random.randn(100))
x.loc[50] = np.nan
x.corr(y)









    Out[121]:





0.0084089711124760817



In [129]:

    
# Ten standard-normal values and the same values moved down one row.
x = pandas.Series(np.random.randn(10))
x1 = x.shift(periods=1)
x









    Out[129]:





0    0.212373
1   -0.030408
2    0.283733
3   -1.717458
4    1.258489
5    0.129850
6    0.427242
7   -2.319428
8   -1.072773
9   -0.916255
dtype: float64



In [130]:

    
# Display the shifted series for comparison with x.
x1









    Out[130]:





0         NaN
1    0.212373
2   -0.030408
3    0.283733
4   -1.717458
5    1.258489
6    0.129850
7    0.427242
8   -2.319428
9   -1.072773
dtype: float64



In [131]:

    
# Correlation of x with its shifted copy x1.
x.corr(x1)









    Out[131]:





-0.11536934490458876



In [132]:

    
# Correlation of x with a copy of itself moved down two rows.
x2 = x.shift(periods=2)
x.corr(x2)









    Out[132]:





0.12814083456920544



In [134]:

    
# statsmodels autocorrelation of x, for comparison with the shift-based
# correlations computed with Series.corr.
smtsa.acf(x)









    Out[134]:





array([ 1.        , -0.11485546,  0.10565009, -0.45566421,  0.26396369,
       -0.0605214 ,  0.00639969, -0.1605427 , -0.05506468, -0.02936502])



In [291]:

    
# Check whether a 5-point triangular rolling window over a constant series
# of ones averages to 1.0.  In the recorded run the assertion failed with
# ACTUAL: 0.6.
n = 10
ser = pandas.Series(np.ones(n))
windowed = pandas.rolling_window(ser, window=5, win_type='triang')
mean = windowed.mean()
np.testing.assert_approx_equal(mean, 1.0)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-291-17fa52b19e4e> in <module>()
      2 ser = pandas.Series(np.ones(n))
      3 mean = pandas.rolling_window(ser, 5, 'triang').mean()
----> 4 np.testing.assert_approx_equal(mean, 1.0)

/home/downey/anaconda/lib/python2.7/site-packages/numpy/testing/utils.pyc in assert_approx_equal(actual, desired, significant, err_msg, verbose)
    564         pass
    565     if np.abs(sc_desired - sc_actual) >= np.power(10., -(significant-1)) :
--> 566         raise AssertionError(msg)
    567 
    568 def assert_array_compare(comparison, x, y, err_msg='', verbose=True,

AssertionError: 
Items are not equal to 7 significant digits:
 ACTUAL: 0.6
 DESIRED: 1.0



In [285]:

    
# Same triangular-window check on the existing `ser`; the recorded run
# failed with ACTUAL: 0.6.
windowed = pandas.rolling_window(ser, window=5, win_type='triang')
mean = windowed.mean()
np.testing.assert_approx_equal(mean, 1.0)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-285-b2b6cd42c94b> in <module>()
      1 mean = pandas.rolling_window(ser, 5, 'triang').mean()
----> 2 np.testing.assert_approx_equal(mean, 1.0)

/home/downey/anaconda/lib/python2.7/site-packages/numpy/testing/utils.pyc in assert_approx_equal(actual, desired, significant, err_msg, verbose)
    564         pass
    565     if np.abs(sc_desired - sc_actual) >= np.power(10., -(significant-1)) :
--> 566         raise AssertionError(msg)
    567 
    568 def assert_array_compare(comparison, x, y, err_msg='', verbose=True,

AssertionError: 
Items are not equal to 7 significant digits:
 ACTUAL: 0.6
 DESIRED: 1.0



In [288]:

    
# Same check with a 5-point Gaussian window (std=1.5); the recorded run
# failed with ACTUAL: 0.6847398773695982.
windowed = pandas.rolling_window(ser, window=5, win_type='gaussian', std=1.5)
mean = windowed.mean()
np.testing.assert_approx_equal(mean, 1.0)









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-288-07cc3e7b7428> in <module>()
      1 mean = pandas.rolling_window(ser, 5, 'gaussian', std=1.5).mean()
----> 2 np.testing.assert_approx_equal(mean, 1.0)

/home/downey/anaconda/lib/python2.7/site-packages/numpy/testing/utils.pyc in assert_approx_equal(actual, desired, significant, err_msg, verbose)
    564         pass
    565     if np.abs(sc_desired - sc_actual) >= np.power(10., -(significant-1)) :
--> 566         raise AssertionError(msg)
    567 
    568 def assert_array_compare(comparison, x, y, err_msg='', verbose=True,

AssertionError: 
Items are not equal to 7 significant digits:
 ACTUAL: 0.6847398773695982
 DESIRED: 1.0



In [289]:

    
# Same check with a 5-point boxcar window; no failure is recorded for this
# cell.
windowed = pandas.rolling_window(ser, window=5, win_type='boxcar')
mean = windowed.mean()
np.testing.assert_approx_equal(mean, 1.0)



In [ ]:

	ppg	date	years
date
2014-04-19	11.841000	2014-04-19	3.627727
2014-04-20	10.712308	2014-04-20	3.630465
2014-04-21	10.984795	2014-04-21	3.633203
2014-04-22	10.760217	2014-04-22	3.635941
2014-04-23	10.920000	2014-04-23	3.638678
2014-04-24	10.250833	2014-04-24	3.641416
2014-04-25	10.482462	2014-04-25	3.644154
2014-04-26	11.379254	2014-04-26	3.646892
2014-04-27	11.482391	2014-04-27	3.649630
2014-04-28	10.784516	2014-04-28	3.652368
2014-04-29	11.750556	2014-04-29	3.655106
2014-04-30	11.212727	2014-04-30	3.657844
2014-05-01	10.286923	2014-05-01	3.660582
2014-05-02	12.260000	2014-05-02	3.663320
2014-05-03	9.942941	2014-05-03	3.666057
2014-05-04	10.552459	2014-05-04	3.668795
2014-05-05	11.290244	2014-05-05	3.671533
2014-05-06	11.199710	2014-05-06	3.674271
2014-05-07	12.064058	2014-05-07	3.677009
2014-05-08	9.953333	2014-05-08	3.679747
2014-05-09	11.468298	2014-05-09	3.682485
2014-05-10	10.532326	2014-05-10	3.685223
2014-05-11	11.518750	2014-05-11	3.687961
2014-05-12	10.578293	2014-05-12	3.690699
2014-05-13	9.604615	2014-05-13	3.693437

	ppg	ppg1	ppg7	ppg30	ppg365
ppg	1.000000	0.486785	0.485610	0.454358	0.337646
ppg1	0.486785	1.000000	0.466057	0.466313	0.294258
ppg7	0.485610	0.466057	1.000000	0.483190	0.259982
ppg30	0.454358	0.466313	0.483190	1.000000	0.320931
ppg365	0.337646	0.294258	0.259982	0.320931	1.000000

Dep. Variable:	resid	R-squared:	0.005
Model:	OLS	Adj. R-squared:	0.004
Method:	Least Squares	F-statistic:	4.820
Date:	Sat, 05 Jul 2014	Prob (F-statistic):	0.0284
Time:	10:55:55	Log-Likelihood:	-1199.5
No. Observations:	985	AIC:	2403.
Df Residuals:	983	BIC:	2413.
Df Model:	1

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	0.0095	0.026	0.364	0.716	-0.042 0.061
resid365	0.0789	0.036	2.195	0.028	0.008 0.149

Omnibus:	31.735	Durbin-Watson:	2.020
Prob(Omnibus):	0.000	Jarque-Bera (JB):	76.140
Skew:	0.071	Prob(JB):	2.93e-17
Kurtosis:	4.355	Cond. No.	1.38

Dep. Variable:	ppg	R-squared:	0.353
Model:	OLS	Adj. R-squared:	0.351
Method:	Least Squares	F-statistic:	239.0
Date:	Fri, 04 Jul 2014	Prob (F-statistic):	8.82e-124
Time:	19:39:03	Log-Likelihood:	-1709.6
No. Observations:	1320	AIC:	3427.
Df Residuals:	1316	BIC:	3448.
Df Model:	3

Omnibus:	71.414	Durbin-Watson:	2.258
Prob(Omnibus):	0.000	Jarque-Bera (JB):	193.744
Skew:	0.254	Prob(JB):	8.49e-43
Kurtosis:	4.807	Cond. No.	294.

Dep. Variable:	D.ppg	No. Observations:	1240
Model:	ARIMA(7, 1, 2)	Log Likelihood	-1487.759
Method:	css-mle	S.D. of innovations	0.802
Date:	Fri, 04 Jul 2014	AIC	2997.517
Time:	15:54:01	BIC	3053.869
Sample:	09-03-2010	HQIC	3018.710
	- 05-13-2014

	coef	std err	z	P>\|z\|	[95.0% Conf. Int.]
const	-0.0022	0.001	-2.994	0.003	-0.004 -0.001
ar.L1.D.ppg	-0.6489	0.260	-2.499	0.013	-1.158 -0.140
ar.L2.D.ppg	-0.0144	0.039	-0.365	0.715	-0.091 0.063
ar.L3.D.ppg	-0.0052	0.040	-0.130	0.896	-0.083 0.073
ar.L4.D.ppg	-0.0224	0.039	-0.578	0.563	-0.098 0.054
ar.L5.D.ppg	-0.0256	0.040	-0.643	0.520	-0.103 0.052
ar.L6.D.ppg	-0.0238	0.039	-0.618	0.537	-0.100 0.052
ar.L7.D.ppg	0.0100	0.033	0.303	0.762	-0.055 0.075
ma.L1.D.ppg	-0.3050	0.258	-1.181	0.238	-0.811 0.201
ma.L2.D.ppg	-0.6419	0.248	-2.585	0.010	-1.129 -0.155

AR.1
	Real	Imaginary	Modulus	Frequency
AR.2
AR.3
AR.4
AR.5
AR.6
AR.7
MA.1
MA.2

Dep. Variable:	ppg	R-squared:	0.141
Model:	OLS	Adj. R-squared:	0.140
Method:	Least Squares	F-statistic:	203.2
Date:	Mon, 30 Jun 2014	Prob (F-statistic):	8.17e-43
Time:	09:42:45	Log-Likelihood:	-1495.5
No. Observations:	1243	AIC:	2995.
Df Residuals:	1241	BIC:	3005.
Df Model:	1

Omnibus:	193.361	Durbin-Watson:	1.787
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2573.911
Skew:	-0.213	Prob(JB):	0.00
Kurtosis:	10.037	Cond. No.	4.69

	xs	ys
0	0.000000	0.465443
1	0.002740	0.711054
2	0.005479	2.491114
3	0.008219	3.852401
4	0.010959	4.277022
5	0.013699	1.454977
6	0.016438	3.082432
7	0.019178	1.450888
8	0.021918	-7.018656
9	0.024658	4.281802
10	0.027397	0.052259
11	0.030137	2.247718
12	0.032877	0.584909
13	0.035616	0.085313
14	0.038356	-0.111767
15	0.041096	0.414558
16	0.043836	0.148125
17	0.046575	0.625406
18	0.049315	0.480564
19	0.052055	0.167461
20	0.054795	0.250639
21	0.057534	-0.181383
22	0.060274	-0.282488
23	0.063014	-0.585985
24	0.065753	0.179916
25	0.068493	0.759256
26	0.071233	-1.298432
27	0.073973	0.295715
28	0.076712	0.348029
29	0.079452	0.101499
30	0.082192	-0.143547
31	0.084932	0.128941
32	0.087671	0.501490
33	0.090411	0.170206
34	0.093151	-0.230790
35	0.095890	1.170705
36	0.098630	-1.374425
37	0.101370	-0.025143
38	0.104110	-1.071267
39	0.106849	-0.203062
40	0.109589	0.734571
41	0.112329	-1.071625
42	0.115068	0.454933
43	0.117808	-0.226399
44	0.120548	-0.251214
45	0.123288	-1.173061
46	0.126027	-1.758051
47	0.128767	0.156741
48	0.131507	-0.415264
49	0.134247	1.345376
50	0.136986	-1.331353
51	0.139726	-0.708725
52	0.142466	0.200268
53	0.145205	-2.121308
54	0.147945	0.114788
55	0.150685	-3.026490
56	0.153425	-1.202670
57	0.156164	-0.418135
58	0.158904	-1.966140
59	0.161644	-1.052760
	...	...

Dep. Variable:	ppg	R-squared:	0.143
Model:	OLS	Adj. R-squared:	0.142
Method:	Least Squares	F-statistic:	103.6
Date:	Mon, 30 Jun 2014	Prob (F-statistic):	2.49e-42
Time:	09:54:11	Log-Likelihood:	-1493.7
No. Observations:	1243	AIC:	2993.
Df Residuals:	1240	BIC:	3009.
Df Model:	2

Omnibus:	187.232	Durbin-Watson:	1.792
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2494.368
Skew:	-0.155	Prob(JB):	0.00
Kurtosis:	9.933	Cond. No.	27.4