Initial setup


In [1]:
## Set up the path to our codebase
import sys
sys.path.append( '../code/' )

In [2]:
## import our time_series codebase
import time_series.generated_datasets
import time_series.result_set
import time_series.algorithm

In [3]:
dataset_0 = time_series.generated_datasets.DSS[0]( 10 )

In [6]:
dataset_0.taxonomy


Out[6]:
["Dset{ non-periodic, non-trend, 10000.0, 100.0 noise = ['Norm(0.0,100.0)']( Constant[10000.0](1.0*t+0) ) }",
 'Induced[Set(0.0 @ 1.0, 1.0 )]']

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt

In [8]:
time_series.generated_datasets.plot_dataset( dataset_0 )


Polynomial Least-Squares Fit Algorithm


In [9]:
## import numpy for polyfit
import numpy as np

In [10]:
##
# Define a new algorithm for a polynomial least-squares fit
class PolyFitAlg( time_series.algorithm.AlertAlgorithm):
    def __init__(self, order):
        self.order = order
        time_series.algorithm.AlertAlgorithm.__init__( self, "Polyfit[{0}]".format(order) )
    def __call__( self, target, history ):
        # fit the polynomial to the history
        n = len(history)
        poly = np.poly1d( np.polyfit( xrange(n), history, self.order ) )
        expected = poly(n)
        difference = abs(target - expected)
        if target != 0:
            fraction = difference / abs(target)
        else:
            # Treat a zero target as if it were 1, i.e. use the absolute difference instead of a fraction
            fraction = difference
        result = {
            'target' : target,
            'expected' : expected,
            'order' : self.order,
            'difference' : difference,
            'fraction' : fraction,
            'poly' : poly,
        }
        return fraction, result

In [11]:
alg_pf = PolyFitAlg( 4 )
frac, res = alg_pf( 10.0, xrange(10) )

In [12]:
res


Out[12]:
{'difference': 7.1054273576010019e-15,
 'expected': 10.000000000000007,
 'fraction': 7.1054273576010023e-16,
 'order': 4,
 'poly': poly1d([  1.44304281e-17,  -2.37052649e-16,   1.12971061e-15,
          1.00000000e+00,  -7.02166694e-16]),
 'target': 10.0}

In [13]:
plt.plot( xrange(13), res['poly']( xrange(13) ))


Out[13]:
[<matplotlib.lines.Line2D at 0x1094fdfd0>]

In [14]:
# the induced anomaly sets the final point to 0.0 (see the taxonomy above),
# so the zero-target branch fires and fraction equals the raw difference
frac,res = alg_pf( dataset_0.time_series[-1], dataset_0.time_series[:-1] )
res


Out[14]:
{'difference': 10130.741780839411,
 'expected': 10130.741780839411,
 'fraction': 10130.741780839411,
 'order': 4,
 'poly': poly1d([ -3.29934214e-01,   7.82653233e+00,  -5.55432287e+01,
          1.16630407e+02,   1.00392259e+04]),
 'target': 0.0}

In [15]:
n = len(dataset_0.time_series)
plt.plot( xrange(n), res['poly']( xrange(n)) )
plt.hold( True )
plt.plot( xrange(n-1), dataset_0.time_series[:-1], 'r.' )


Out[15]:
[<matplotlib.lines.Line2D at 0x1095c5a10>]

In [16]:
dataset_0.taxonomy


Out[16]:
["Dset{ non-periodic, non-trend, 10000.0, 100.0 noise = ['Norm(0.0,100.0)']( Constant[10000.0](1.0*t+0) ) }",
 'Induced[Set(0.0 @ 1.0, 1.0 )]']

In [17]:
n = len(dataset_0.time_series)
plt.plot( xrange(n), res['poly']( xrange(n)) )
plt.hold( True )
# this time include the final, induced-anomaly point
plt.plot( xrange(n), dataset_0.time_series, 'r.' )


Out[17]:
[<matplotlib.lines.Line2D at 0x109463290>]

In [18]:
alg_pf8 = PolyFitAlg( 8 )
frac8,res8 = alg_pf8( dataset_0.time_series[-1], dataset_0.time_series[:-1] )
n = len(dataset_0.time_series)
plt.figure()
# the order-8 fit over the history it was trained on
plt.plot( xrange(n-1), res8['poly']( xrange(n-1)) )
plt.hold( True )
plt.plot( xrange(n-1), dataset_0.time_series[:-1], 'r.', ms=10 )
plt.figure()
# the same fit extrapolated two steps past the data
plt.plot( xrange(n + 2), res8['poly']( xrange(n + 2)) )
plt.hold( True )
plt.plot( xrange(n), dataset_0.time_series, 'r.', ms=10 )


Out[18]:
[<matplotlib.lines.Line2D at 0x109b72150>]

Mean-Square Error as a Measure of Fit
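
For a history y_0, ..., y_{n-1} and a fitted polynomial p, the mean squared error is

  MSE = (1/n) * sum_i ( y_i - p(i) )^2

and a lower MSE means a tighter fit to the history. The helper below computes it against the polynomial recovered by a PolyFit algorithm.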


In [19]:
## compute the mean squared error for a history and a PolyFit algorithm
def polyfit_mse( alg, history ):
    # first fit the algorithm with a dummy target
    frac, res = alg( history[-1], history )
    
    # ok, grab polynomial from fit and compute errors
    poly = res['poly']
    x = xrange(len(history))
    errors = np.array(history) - poly(x)
    
    # compute the mean squared error (errors is 1-D, so square elementwise)
    mse = np.mean( errors ** 2 )
    return mse

In [20]:
mse_pf4 = polyfit_mse( alg_pf, dataset_0.time_series[:-1] )
mse_pf8 = polyfit_mse( alg_pf8, dataset_0.time_series[:-1] )
print "order 4 MSE: {0}".format( mse_pf4 )
print "order 8 MSE: {0}".format( mse_pf8 )


order 4 MSE: 3353.73411832
order 8 MSE: 1.78470794163e-17

Best Historical MSE = Best Future Performance?

The order-8 MSE is numerically zero because an order-8 polynomial has 9 coefficients and therefore interpolates the 9-point history exactly. Does that perfect historical fit translate into better detection on new datasets?


In [21]:
run_spec_pf4 = time_series.result_set.RunSpec( time_series.generated_datasets.DSS[0], alg_pf)
run_spec_pf8 = time_series.result_set.RunSpec( time_series.generated_datasets.DSS[0], alg_pf8)
rset_pf4 = run_spec_pf4.collect_results( 20, 5, 9 )
rset_pf8 = run_spec_pf8.collect_results( 20, 5, 9 )
stats_pf4 = time_series.result_set.compute_classifier_stats( rset_pf4, 0.5 )
stats_pf8 = time_series.result_set.compute_classifier_stats( rset_pf8, 0.5 )
print "order 4 stats: {0}".format( stats_pf4 )
print "order 8 stats: {0}".format( stats_pf8 )


order 4 stats: {'tn': 45.0, 'fp': 0.0, 'fn': 0.0, 'recall': 1.0, 'fall_out': 0.0, 'precision': 1.0, 'tp': 5.0, 'accuracy': 1.0}
order 8 stats: {'tn': 39.0, 'fp': 6.0, 'fn': 0.0, 'recall': 1.0, 'fall_out': 0.13333333333333333, 'precision': 0.45454545454545453, 'tp': 5.0, 'accuracy': 0.88}
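
As a sanity check on how to read these stats, the order-8 precision and recall follow directly from the confusion counts printed above (a minimal check using only those numbers):

In [ ]:
## recompute order-8 precision and recall from the confusion counts above
tp, fp, fn = 5.0, 6.0, 0.0
print "precision: {0}".format( tp / (tp + fp) )  # 0.4545..., as reported
print "recall:    {0}".format( tp / (tp + fn) )  # 1.0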

Overfitting!

So what now? This is where we bring in the usual methods for preventing overfitting:

  • Cross-validation based model selection (see the sketch after this list)
  • AIC/BIC and the other information criteria, which weigh model fit against model complexity
  • Algorithm stability ( stability == generalizability )
  • Statistical learning theory, learning over nested structures (structural risk minimization)
  • Choosing a better performance criterion than historical fit
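
For the first bullet, here is a minimal, hypothetical sketch of one-step-ahead cross-validation for choosing the polynomial order. It reuses PolyFitAlg from above; the helper name cross_validate_order and the rule that every training prefix must have more points than the largest candidate order are our assumptions, not part of the codebase.

In [ ]:
## Hypothetical sketch: choose the polynomial order by one-step-ahead
## cross-validation on the history (reuses PolyFitAlg defined above)
def cross_validate_order( history, orders ):
    # every training prefix needs more points than the largest order
    min_train = max( orders ) + 1
    best_order, best_err = None, None
    for order in orders:
        alg = PolyFitAlg( order )
        sq_errors = []
        for t in xrange( min_train, len(history) ):
            # fit on the prefix history[:t] and predict point t
            frac, res = alg( history[t], history[:t] )
            sq_errors.append( res['difference'] ** 2 )
        mean_err = np.mean( sq_errors )
        if best_err is None or mean_err < best_err:
            best_order, best_err = order, mean_err
    return best_order, best_err

cross_validate_order( dataset_0.time_series[:-1], [ 1, 2, 4 ] )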

In [ ]:
## pseudocode: score a model by historical fit plus a complexity penalty
##   aic( alg ) = MSE( alg ) + log(N) * COMPLEXITY
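
A minimal runnable version of that score, under the assumption that COMPLEXITY is simply the number of polynomial coefficients (order + 1); the name ic_score and that complexity measure are our choices, not the codebase's. Note that the log(N) weight makes this a BIC-style penalty.

In [ ]:
## Hypothetical implementation of the score above, taking COMPLEXITY
## to be the number of polynomial coefficients (order + 1)
def ic_score( alg, history ):
    complexity = alg.order + 1
    # lower is better: historical fit plus a log(N)-weighted size penalty
    return polyfit_mse( alg, history ) + np.log( len(history) ) * complexity

print "order 4 score: {0}".format( ic_score( alg_pf, dataset_0.time_series[:-1] ) )
print "order 8 score: {0}".format( ic_score( alg_pf8, dataset_0.time_series[:-1] ) )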