In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (15.0, 15.0)

In [2]:
from matplotlib import rc
rc('font', size=14)

In [3]:
from numpy import loadtxt, arange, log, delete, array, ones, concatenate
from numpy.linalg import lstsq
from scipy.stats import f
from scipy.stats.mstats import normaltest
from pandas.stats.moments import ewma
from pandas import Series

In [4]:
from common import *

In [5]:
MAX_POW = 2
CRITICAL_F = 4.2

In [6]:
dataset = loadtxt('data.csv', skiprows=1, delimiter=',')
t = arange(dataset.shape[0])
y = dataset[:, 0]
xs = dataset[:, 1:].transpose()
XS = [TimeSeries(x) for x in xs]

In [7]:
for i, x in enumerate(XS):
    plt.plot(t, x.data, label='$X_%d$'%(i+1))

plt.title('Independent variables')
plt.ylabel('Value')
plt.xlabel('Time')
plt.legend()
plt.show()



In [8]:
for i, x in enumerate(XS):
    i += 1
    plt.subplot(320 + i)
    plt.title(r'$X_{%d}$ smoothed with $%d$, $\sigma=%2.2f$'%(i, x.span, x.trend_errors.std()))
    plt.plot(x.data)
    plt.plot(x.trend_forecasted)
plt.show()



In [9]:
for i, x in enumerate(XS):
    i += 1
    plt.subplot(320 + i)
    plt.title(r'$X_{%d}$ autocorrelation'%i)
    plt.plot(x.autocorrelation)
    plt.axhline(linestyle='--')
    plt.xlabel('Lag')
plt.show()



In [10]:
for i, x in enumerate(XS):
    i += 1
    plt.subplot(320 + i)
    plt.title(r'$X_{%d}$ estimate, $\sigma=%2.2f$'%(i, x.estimate_errors.std()))
    plt.plot(x.data)
    plt.plot(x.data_forecasted)
plt.show()



In [11]:
y_without_anomalies = y.copy()
for anomaly_point in (2, 12, 4):
    y_without_anomalies[anomaly_point] = .5 * (
        y_without_anomalies[anomaly_point - 1] + y_without_anomalies[anomaly_point + 1])

In [12]:
accum = [X.data_forecasted for X in XS]
names = ['X[%d]'%i for i in range(len(XS))]
sequences = [X.data_forecasted for X in XS]
for i in range(len(accum)):
    for j in range(i, len(accum)):
        sequences.append(accum[i] * accum[j])
        names.append('X[%d] * X[%d]'%(i, j))
sequences = array(sequences)

In [13]:
new_sequences = sequences.copy()
coefficients = [y_without_anomalies.mean()]
regressors = [ones(XS[0].data_forecasted.size)]
all_coefficients = [coefficients[0]]
all_regressors = [regressors[0]]
tmp_names = names.copy()
for i in range(min(sequences.shape[0], y.size)):
    regressors, coefficients, new_sequences, F, best = get_coeffs(y_without_anomalies, regressors, coefficients, new_sequences)
    all_coefficients.append(coefficients)
    all_regressors.append(regressors[-1])
    print('Step %d: the best is %s'%(i+1, tmp_names[best]))
    tmp_names = delete(tmp_names, best)
    if F < CRITICAL_F:
        print('Coefficients', coefficients.tolist())
        break
else:
    print('Best solution not found')


Step 1: the best is X[1] * X[5]
Step 2: the best is X[2] * X[4]
Step 3: the best is X[4] * X[5]
Coefficients [-3.3297720461223634, 1.7533891573018359, 15.590866650869222, 22.668791421642325]

In [14]:
# x_ = [ones(XS[0].data_forecasted.size),
#       XS[1].data_forecasted * XS[5].data_forecasted,
#       XS[2].data_forecasted * XS[4].data_forecasted,
#       XS[4].data_forecasted * XS[5].data_forecasted]
y_estimate = coefficients.dot(regressors)
y_estimate = coefficients.dot(x_)

plt.plot(y_without_anomalies, label='$Y$ time series')
plt.plot(y_estimate, label='$Y$ estimate and prediction by $X$')

plt.legend()
plt.show()



NameErrorTraceback (most recent call last)
<ipython-input-14-40e5e3f7d737> in <module>()
      4 #       XS[4].data_forecasted * XS[5].data_forecasted]
      5 y_estimate = coefficients.dot(regressors)
----> 6 y_estimate = coefficients.dot(x_)
      7 
      8 plt.plot(y_without_anomalies, label='$Y$ time series')

NameError: name 'x_' is not defined

In [ ]:
for i, x in enumerate(XS):
    i += 1
    plt.subplot(320 + i)
    plt.title(r'$X_{%d}$ three steps forecast'%i)
    plt.plot(x.data)
    plt.plot(x.forecast_test)
plt.show()