In [1]:
import pandas as pd
import numpy as np
import pyaf.ForecastEngine as autof
import pyaf.Bench.TS_datasets as tsds
#import SignalDecomposition_Perf as tsperf


%matplotlib inline

In [2]:
import datetime


def convert_double_to_datetime(x):
    """Convert a fractional year (e.g. ``1983.5``) into a ``datetime``.

    The integer part of ``x`` selects the calendar year; the fractional part
    is interpreted as the share of that year already elapsed. The elapsed
    share is converted to a whole number of days (truncated, not rounded),
    so the result always falls on midnight.

    Parameters
    ----------
    x : float or int
        Year with an optional fractional part.

    Returns
    -------
    datetime.datetime
        January 1st of ``int(x)`` plus the truncated number of elapsed days.
    """
    year = int(x)
    year_start = datetime.datetime(year, 1, 1, 0, 0, 0)
    next_year_start = datetime.datetime(year + 1, 1, 1, 0, 0, 0)
    # Length of this particular year in days (365, or 366 for leap years).
    days_in_year = (next_year_start - year_start).days
    # Truncate to whole days: sub-day precision is intentionally dropped.
    elapsed_days = int(days_in_year * (x - year))
    return year_start + datetime.timedelta(days=elapsed_days)

In [3]:
# Example of using the autoforecast public API on a dataset exported from R; the URL below hosts more datasets.
def analyzeTimeSeriesDataset(filename, horizon):
    """Train, forecast and plot every signal column of one R_TSData CSV file.

    The CSV is downloaded from the TimeSeriesData GitHub repository. Its first
    column is used as the time column and converted from fractional years with
    ``convert_double_to_datetime``; every other column is modelled separately.
    The last ``horizon`` rows are held out of training and then compared with
    the forecast produced for them.

    Parameters
    ----------
    filename : str
        File name inside the ``R_TSData`` directory, e.g. ``"AirPassengers.csv"``.
    horizon : int
        Number of trailing rows to hold out and forecast. Must satisfy
        ``0 < horizon < number of rows``.

    Returns
    -------
    None
        All results are printed and plotted.

    Raises
    ------
    ValueError
        If ``horizon`` is not strictly between 0 and the number of rows.
    """
    # filename = AirPassengers.csv
    lCSVFile = "https://raw.githubusercontent.com/antoinecarme/TimeSeriesData/master/R_TSData/" + filename
    # Load the CSV file into a pandas dataframe.
    df = pd.read_csv(lCSVFile, sep=r',', engine='python')
    # In R_TSData, the date column is the first column.
    lDateCol = df.columns[0]
    df[lDateCol] = df[lDateCol].apply(convert_double_to_datetime)
    # Length of the time series.
    lLength = df.shape[0]
    # A horizon >= lLength would make the slice end negative and silently
    # train on the wrong window, so reject it explicitly.
    if not (0 < horizon < lLength):
        raise ValueError(
            "horizon must be between 1 and %d (got %r)" % (lLength - 1, horizon))
    # Use only the first N - H rows for training; the last H are predicted
    # and compared with the actual values.
    lTrainDataset = df.iloc[:lLength - horizon]
    for col in df.columns:
        if col == lDateCol:
            continue
        # Each column is forecast separately in this demo.
        lSignalCol = col
        # Create a model (autoforecast object) that handles the whole process.
        lAutoF = autof.cForecastEngine()
        # Set some options.
        lAutoF.mOptions.mEnableSeasonals = True
        #lAutoF.mOptions.enable_slow_mode()
        #lAutoF.mOptions.mCycle_Criterion = "L2";
        #lAutoF.mOptions.mCycle_Criterion_Threshold = 10000.2;
        # Train the model.
        lAutoF.train(lTrainDataset, lDateCol, lSignalCol, horizon)
        # Get some model info.
        lAutoF.getModelInfo()
        # Illustration only: shows how to reach advanced model details.
        # The value is not used (and not displayed) here.
        lAutoF.mSignalDecomposition.mBestModel.mTimeInfo.mResolution
        # Define an input dataframe.
        lInput = lTrainDataset.copy()
        # Output dataframe ('forecast' API call).
        lOutput = lAutoF.forecast(lInput, horizon)
        print("Forecast Columns ", lOutput.columns)
        # In the output dataframe, keep only the date, signal and forecast columns.
        lForecastCol = lSignalCol + '_Forecast'
        lForecastDataFrame = lOutput[[lDateCol, lSignalCol, lForecastCol]]
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        lForecastDataFrame.info()
        # Actual values.
        print("Actual : \n", df[[lDateCol, lSignalCol]].tail(horizon).values)
        # Predicted values.
        print("Predicted : \n", lForecastDataFrame.tail(horizon).values)

        # Serialize the model as JSON.
        print("\n\n<ModelInfo>")
        print(lAutoF.to_json())
        print("</ModelInfo>\n\n")

        # Serialize the forecasts as JSON.
        print("\n\n<Forecast>")
        print(lForecastDataFrame.tail(horizon).to_json(date_format='iso'))
        print("</Forecast>\n\n")

        # Some plots.
        # NOTE(review): the method name is kept exactly as spelled in the
        # original notebook; confirm it against the installed pyaf version.
        lAutoF.standrdPlots()

        # Side-by-side view of the actual signal, the forecast and their difference.
        # Columns are assigned one by one so they align on df's index.
        lActualAndPredictedDF = pd.DataFrame()
        lActualAndPredictedDF['Date'] = df[lDateCol]
        lActualAndPredictedDF['Signal'] = df[lSignalCol]
        lActualAndPredictedDF['Forecast'] = lForecastDataFrame[lForecastCol]
        lActualAndPredictedDF['Residue'] = lActualAndPredictedDF['Forecast'] - lActualAndPredictedDF['Signal']
        print(lActualAndPredictedDF.tail(horizon).values)
        lActualAndPredictedDF.plot.line('Date', ['Signal', 'Forecast', 'Residue'], figsize=(32, 16))

    return None

In [4]:
# File names of the R time series datasets that can be passed to
# analyzeTimeSeriesDataset.
R_TSeries = [
    "AirPassengers.csv", "austres.csv", "beaver1.csv", "beaver2.csv",
    "BJsales.csv", "co2.csv", "DM.csv", "EuStockMarkets.csv",
    "fdeaths.csv", "JohnsonJohnson.csv", "LakeHuron.csv", "ldeaths.csv",
    "lh.csv", "lynx.csv", "mdeaths.csv", "Nile.csv", "nottem.csv",
    "sunspot.month.csv", "sunspots.csv", "sunspot.year.csv", "treering.csv",
    "UKDriverDeaths.csv", "UKgas.csv", "USAccDeaths.csv",
    "WWWusage.csv",
]

# Run the demo on the sunspots dataset, holding out the last 7 rows for comparison.
analyzeTimeSeriesDataset(filename="sunspots.csv", horizon=7)


Forecast Columns  Index(['Index', 'RelDiff_x', 'x', 'row_number', 'Index_Normalized',
       'RelDiff_x_Lag1Trend', 'RelDiff_x_Lag1Trend_residue',
       'RelDiff_x_Lag1Trend_residue_Seasonal_DayOfWeek',
       'RelDiff_x_Lag1Trend_residue_Seasonal_DayOfWeek_residue',
       'RelDiff_x_Lag1Trend_residue_Seasonal_DayOfWeek_residue_NoAR',
       'RelDiff_x_Lag1Trend_residue_Seasonal_DayOfWeek_residue_NoAR_residue',
       'RelDiff_x_Trend', 'RelDiff_x_Trend_residue', 'RelDiff_x_Cycle',
       'RelDiff_x_Cycle_residue', 'RelDiff_x_AR', 'RelDiff_x_AR_residue',
       'RelDiff_x_TransformedForecast', 'RelDiff_x_TransformedResidue',
       'x_Forecast', 'x_Residue', 'x_Forecast_Lower_Bound',
       'x_Forecast_Upper_Bound'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2820 entries, 0 to 2819
Data columns (total 3 columns):
Index         2820 non-null datetime64[ns]
x             2813 non-null float64
x_Forecast    2820 non-null float64
dtypes: datetime64[ns](1), float64(2)
memory usage: 66.2 KB
None
Actual : 
 [[Timestamp('1983-06-02 00:00:00') 91.1]
 [Timestamp('1983-07-02 00:00:00') 82.2]
 [Timestamp('1983-08-01 00:00:00') 71.8]
 [Timestamp('1983-09-01 00:00:00') 50.3]
 [Timestamp('1983-10-01 00:00:00') 55.8]
 [Timestamp('1983-11-01 00:00:00') 33.3]
 [Timestamp('1983-12-01 00:00:00') 33.4]]
Predicted : 
 [[Timestamp('1983-06-01 10:27:52') nan 94.71443661895684]
 [Timestamp('1983-07-01 20:55:44') nan 90.28531856275191]
 [Timestamp('1983-08-01 07:23:36') nan 85.27260898272277]
 [Timestamp('1983-08-31 17:51:28') nan 80.20437977365644]
 [Timestamp('1983-10-01 04:19:20') nan 76.13487919589356]
 [Timestamp('1983-10-31 14:47:12') nan 71.50139695695702]
 [Timestamp('1983-12-01 01:15:04') nan 67.41001598031497]]


<ModelInfo>
{
    "Dataset": {
        "Signal": "x",
        "Time": {
            "Horizon": 7,
            "TimeMinMax": [
                "1749-01-01 00:00:00",
                "1983-05-02 00:00:00"
            ],
            "TimeVariable": "Index"
        },
        "Training_Signal_Length": 2813
    },
    "Model": {
        "AR_Model": "NoAR",
        "Best_Decomposition": "RelDiff_x_Lag1Trend_residue_Seasonal_DayOfWeek_residue_NoAR",
        "Cycle": "Seasonal_DayOfWeek",
        "Signal_Transoformation": "RelativeDifference",
        "Trend": "Lag1Trend"
    },
    "Model_Performance": {
        "COMPLEXITY": "68",
        "MAPE": "0.3663",
        "RMSE": "20.4019812735"
    }
}
</ModelInfo>




<Forecast>
{"Index":{"2813":"1983-06-01T10:27:52.000Z","2814":"1983-07-01T20:55:44.000Z","2815":"1983-08-01T07:23:36.000Z","2816":"1983-08-31T17:51:28.000Z","2817":"1983-10-01T04:19:20.000Z","2818":"1983-10-31T14:47:12.000Z","2819":"1983-12-01T01:15:04.000Z"},"x":{"2813":null,"2814":null,"2815":null,"2816":null,"2817":null,"2818":null,"2819":null},"x_Forecast":{"2813":94.714436619,"2814":90.2853185628,"2815":85.2726089827,"2816":80.2043797737,"2817":76.1348791959,"2818":71.501396957,"2819":67.4100159803}}
</Forecast>


/usr/lib/python3/dist-packages/matplotlib/__init__.py:1403: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)
[[Timestamp('1983-06-02 00:00:00') 91.1 94.71443661895684
  3.6144366189568444]
 [Timestamp('1983-07-02 00:00:00') 82.2 90.28531856275191 8.085318562751908]
 [Timestamp('1983-08-01 00:00:00') 71.8 85.27260898272277
  13.472608982722775]
 [Timestamp('1983-09-01 00:00:00') 50.3 80.20437977365644
  29.904379773656444]
 [Timestamp('1983-10-01 00:00:00') 55.8 76.13487919589356
  20.334879195893564]
 [Timestamp('1983-11-01 00:00:00') 33.3 71.50139695695702
  38.201396956957026]
 [Timestamp('1983-12-01 00:00:00') 33.4 67.41001598031497 34.01001598031497]]

In [5]:
# lCSVFile = "https://raw.githubusercontent.com/antoinecarme/TimeSeriesData/master/R_TSData/" + "AirPassengers.csv";
# df = pd.read_csv(lCSVFile, sep=r',', engine='python');
# df.info()

In [6]:
# df.Index

In [7]:
# a = df.Index.apply(lambda x : convert_double_to_datetime(x))
# a

In [ ]: