Initial setup


In [1]:
## Set up the path to our codebase
import sys
sys.path.append( '../code/' )

In [2]:
## import our time_series codebase
import time_series.generated_datasets
import time_series.result_set
import time_series.algorithm

In [3]:
dataset_0 = time_series.generated_datasets.DSS[0]( 10 )

In [6]:
dataset_0.taxonomy


Out[6]:
["Dset{ non-periodic, non-trend, 10000.0, 100.0 noise = ['Norm(0.0,100.0)']( Constant[10000.0](1.0*t+0) ) }",
 'Induced[Set(0.0 @ 1.0, 1.0 )]']

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt

In [8]:
time_series.generated_datasets.plot_dataset( dataset_0 )


Polynomial Least-Squares Fit Algorithm


In [9]:
## import numpy for polyfit
import numpy as np

In [10]:
##
# Define a new algorithm for a polynomial least-squares fit
class PolyFitAlg( time_series.algorithm.AlertAlgorithm):
    def __init__(self, order):
        self.order = order
        time_series.algorithm.AlertAlgorithm.__init__( self, "Polyfit[{0}]".format(order) )
    def __call__( self, target, history ):
        # fit the polynomial to the history
        n = len(history)
        poly = np.poly1d( np.polyfit( xrange(n), history, self.order ) )
        expected = poly(n)
        difference = abs(target - expected)
        if target != 0:
            fraction = difference / abs(target)
        else:
            # Treat a zero target as if it were 1, i.e. use the absolute difference instead of a fraction
            fraction = difference
        result = {
            'target' : target,
            'expected' : expected,
            'order' : self.order,
            'difference' : difference,
            'fraction' : fraction,
            'poly' : poly,
        }
        return fraction, result

In [11]:
alg_pf = PolyFitAlg( 4 )
frac, res = alg_pf( 10.0, xrange(10) )

In [12]:
res


Out[12]:
{'difference': 7.1054273576010019e-15,
 'expected': 10.000000000000007,
 'fraction': 7.1054273576010023e-16,
 'order': 4,
 'poly': poly1d([  1.44304281e-17,  -2.37052649e-16,   1.12971061e-15,
          1.00000000e+00,  -7.02166694e-16]),
 'target': 10.0}

In [13]:
plt.plot( xrange(13), res['poly']( xrange(13) ))


Out[13]:
[<matplotlib.lines.Line2D at 0x1094fdfd0>]

In [14]:
# the induced anomaly sets the final point to 0.0 (see the taxonomy above),
# so the zero-target branch fires and fraction equals the raw difference
frac,res = alg_pf( dataset_0.time_series[-1], dataset_0.time_series[:-1] )
res


Out[14]:
{'difference': 10130.741780839411,
 'expected': 10130.741780839411,
 'fraction': 10130.741780839411,
 'order': 4,
 'poly': poly1d([ -3.29934214e-01,   7.82653233e+00,  -5.55432287e+01,
          1.16630407e+02,   1.00392259e+04]),
 'target': 0.0}

In [15]:
n = len(dataset_0.time_series)
plt.plot( xrange(n), res['poly']( xrange(n)) )
plt.hold( True )
plt.plot( xrange(n-1), dataset_0.time_series[:-1], 'r.' )


Out[15]:
[<matplotlib.lines.Line2D at 0x1095c5a10>]

In [16]:
dataset_0.taxonomy


Out[16]:
["Dset{ non-periodic, non-trend, 10000.0, 100.0 noise = ['Norm(0.0,100.0)']( Constant[10000.0](1.0*t+0) ) }",
 'Induced[Set(0.0 @ 1.0, 1.0 )]']

In [17]:
n = len(dataset_0.time_series)
plt.plot( xrange(n), res['poly']( xrange(n)) )
plt.hold( True )
# this time include the final, induced-anomaly point
plt.plot( xrange(n), dataset_0.time_series, 'r.' )


Out[17]:
[<matplotlib.lines.Line2D at 0x109463290>]

In [18]:
alg_pf8 = PolyFitAlg( 8 )
frac8,res8 = alg_pf8( dataset_0.time_series[-1], dataset_0.time_series[:-1] )
n = len(dataset_0.time_series)
plt.figure()
# the order-8 fit over the history it was trained on
plt.plot( xrange(n-1), res8['poly']( xrange(n-1)) )
plt.hold( True )
plt.plot( xrange(n-1), dataset_0.time_series[:-1], 'r.', ms=10 )
plt.figure()
# the same fit extrapolated two steps past the data
plt.plot( xrange(n + 2), res8['poly']( xrange(n + 2)) )
plt.hold( True )
plt.plot( xrange(n), dataset_0.time_series, 'r.', ms=10 )


Out[18]:
[<matplotlib.lines.Line2D at 0x109b72150>]

Mean-Square Error as a Measure of Fit
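
For a history y_0, ..., y_{n-1} and a fitted polynomial p, the mean squared error is

  MSE = (1/n) * sum_i ( y_i - p(i) )^2

and a lower MSE means a tighter fit to the history. The helper below computes it against the polynomial recovered by a PolyFit algorithm.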


In [19]:
## compute the mean squared error for a history and a PolyFit algorithm
def polyfit_mse( alg, history ):
    # first fit the algorithm with a dummy target
    frac, res = alg( history[-1], history )
    
    # ok, grab polynomial from fit and compute errors
    poly = res['poly']
    x = xrange(len(history))
    errors = np.array(history) - poly(x)
    
    # compute the mean squared error (errors is 1-D, so square elementwise)
    mse = np.mean( errors ** 2 )
    return mse

In [20]:
mse_pf4 = polyfit_mse( alg_pf, dataset_0.time_series[:-1] )
mse_pf8 = polyfit_mse( alg_pf8, dataset_0.time_series[:-1] )
print "order 4 MSE: {0}".format( mse_pf4 )
print "order 8 MSE: {0}".format( mse_pf8 )


order 4 MSE: 3353.73411832
order 8 MSE: 1.78470794163e-17

Best Historical MSE = Best Future Performance?

The order-8 MSE is numerically zero because an order-8 polynomial has 9 coefficients and therefore interpolates the 9-point history exactly. Does that perfect historical fit translate into better detection on new datasets?


In [21]:
run_spec_pf4 = time_series.result_set.RunSpec( time_series.generated_datasets.DSS[0], alg_pf)
run_spec_pf8 = time_series.result_set.RunSpec( time_series.generated_datasets.DSS[0], alg_pf8)
rset_pf4 = run_spec_pf4.collect_results( 20, 5, 9 )
rset_pf8 = run_spec_pf8.collect_results( 20, 5, 9 )
stats_pf4 = time_series.result_set.compute_classifier_stats( rset_pf4, 0.5 )
stats_pf8 = time_series.result_set.compute_classifier_stats( rset_pf8, 0.5 )
print "order 4 stats: {0}".format( stats_pf4 )
print "order 8 stats: {0}".format( stats_pf8 )


order 4 stats: {'tn': 45.0, 'fp': 0.0, 'fn': 0.0, 'recall': 1.0, 'fall_out': 0.0, 'precision': 1.0, 'tp': 5.0, 'accuracy': 1.0}
order 8 stats: {'tn': 39.0, 'fp': 6.0, 'fn': 0.0, 'recall': 1.0, 'fall_out': 0.13333333333333333, 'precision': 0.45454545454545453, 'tp': 5.0, 'accuracy': 0.88}
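
As a sanity check on how to read these stats, the order-8 precision and recall follow directly from the confusion counts printed above (a minimal check using only those numbers):

In [ ]:
## recompute order-8 precision and recall from the confusion counts above
tp, fp, fn = 5.0, 6.0, 0.0
print "precision: {0}".format( tp / (tp + fp) )  # 0.4545..., as reported
print "recall:    {0}".format( tp / (tp + fn) )  # 1.0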

Overfitting!

So what now? This is where we bring in the usual methods for preventing overfitting:

  • Cross-validation based model selection (see the sketch after this list)
  • AIC/BIC and the other information criteria, which weigh model fit against model complexity
  • Algorithm stability ( stability == generalizability )
  • Statistical learning theory, learning over nested structures (structural risk minimization)
  • Choosing a better performance criterion than historical fit
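
For the first bullet, here is a minimal, hypothetical sketch of one-step-ahead cross-validation for choosing the polynomial order. It reuses PolyFitAlg from above; the helper name cross_validate_order and the rule that every training prefix must have more points than the largest candidate order are our assumptions, not part of the codebase.

In [ ]:
## Hypothetical sketch: choose the polynomial order by one-step-ahead
## cross-validation on the history (reuses PolyFitAlg defined above)
def cross_validate_order( history, orders ):
    # every training prefix needs more points than the largest order
    min_train = max( orders ) + 1
    best_order, best_err = None, None
    for order in orders:
        alg = PolyFitAlg( order )
        sq_errors = []
        for t in xrange( min_train, len(history) ):
            # fit on the prefix history[:t] and predict point t
            frac, res = alg( history[t], history[:t] )
            sq_errors.append( res['difference'] ** 2 )
        mean_err = np.mean( sq_errors )
        if best_err is None or mean_err < best_err:
            best_order, best_err = order, mean_err
    return best_order, best_err

cross_validate_order( dataset_0.time_series[:-1], [ 1, 2, 4 ] )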

In [ ]:
## pseudocode: score a model by historical fit plus a complexity penalty
##   aic( alg ) = MSE( alg ) + log(N) * COMPLEXITY
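
A minimal runnable version of that score, under the assumption that COMPLEXITY is simply the number of polynomial coefficients (order + 1); the name ic_score and that complexity measure are our choices, not the codebase's. Note that the log(N) weight makes this a BIC-style penalty.

In [ ]:
## Hypothetical implementation of the score above, taking COMPLEXITY
## to be the number of polynomial coefficients (order + 1)
def ic_score( alg, history ):
    complexity = alg.order + 1
    # lower is better: historical fit plus a log(N)-weighted size penalty
    return polyfit_mse( alg, history ) + np.log( len(history) ) * complexity

print "order 4 score: {0}".format( ic_score( alg_pf, dataset_0.time_series[:-1] ) )
print "order 8 score: {0}".format( ic_score( alg_pf8, dataset_0.time_series[:-1] ) )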