In [ ]:
import pandas as pd
import numpy as np

In [ ]:
def load_airline_passengers():
    """Download the AirPassengers CSV, save a local copy and return it.

    The file is read from the Rdatasets site with its first line skipped and
    the three columns relabelled ``ID``, ``time`` and ``AirPassengers``;
    ``ID`` becomes the index.  The resulting frame is also written to
    ``passengers_train.csv`` in the current working directory.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ID`` with the columns ``time`` and ``AirPassengers``.
    """
    source_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/AirPassengers.csv"
    column_names = ["ID", "time", "AirPassengers"]

    # skiprows=1 discards the first line of the file (presumably its own
    # header row) so that the labels given in `names` are used instead.
    passengers = pd.read_csv(
        source_url,
        names=column_names,
        sep=r',',
        index_col='ID',
        engine='python',
        skiprows=1,
    )

    # Side effect: keep a local copy next to the notebook.
    passengers.to_csv("passengers_train.csv")

    return passengers

In [ ]:
df = load_airline_passengers()

In [ ]:


In [ ]:
df.head()

In [ ]:
from datetime import datetime, date, time
# Integer part of the smallest 'time' value, and the 1 January of that year.
min_val = int(df['time'].min())
first_date = date(year=min_val, month=1, day=1)

def to_date_passengers(x):
    """Convert a fractional year (e.g. ``1949.5``) to the first day of a month.

    The integer part of ``x`` is the year; the fractional part is scaled to
    twelve months.  A small 0.01 offset is added before truncating so that
    fractions lying marginally below a month boundary still land on the
    intended month.

    Parameters
    ----------
    x : float
        Year with the month encoded in the fractional part.

    Returns
    -------
    datetime.date
        Day 1 of the computed month in year ``int(x)``.
    """
    year = int(x)
    # Zero-based month offset within the year (0 -> January).
    month_offset = int(12 * (x - year + 0.01))
    return date(year, month_offset + 1, 1)

# Add a calendar-date version of the fractional-year 'time' column.
df['real_time'] = df['time'].apply(to_date_passengers)

In [ ]:
#pd.read_csv?

In [ ]:
#df['AirPassengers'] = np.log(df['AirPassengers'])

In [ ]:
%matplotlib inline

In [ ]:
first_date

In [ ]:
#df.plot?

In [ ]:
df.plot.line('real_time' , 'AirPassengers')

In [ ]:
def computePerf(signal , estimator):
    """Return the mean absolute relative error of ``estimator`` against ``signal``.

    Computes ``mean(|signal - estimator| / |signal|)`` element-wise.

    Parameters
    ----------
    signal : array-like supporting arithmetic (e.g. numpy array, pandas Series)
        Reference values; used as the denominator.
    estimator : array-like of the same shape
        Values being evaluated.

    Returns
    -------
    float
        The mean of the absolute relative errors (a fraction, not a percentage).
    """
    relative_error = (signal - estimator) / signal
    return np.mean(abs(relative_error))

Trend Modeling


In [ ]:
import sklearn as skl
import sklearn.preprocessing as preprocessing
import sklearn.linear_model as linear_model

# Fit a ridge regression of the passenger counts on the 'time' column.
# The target is deliberately passed as a 2-D column (double brackets), the
# same layout the residue computation further down uses.
ridge = linear_model.Ridge()
ridge.fit(df[['time']].values, df[['AirPassengers']].values)

# Predict from the same plain-array input the model was fitted on.  Fitting on
# `.values` but predicting on the DataFrame mixes unnamed and named features,
# which scikit-learn reports with a feature-names warning.
Y_pred = ridge.predict(df[['time']].values)

In [ ]:
ridge.score(df[['time']].values, df[['AirPassengers']].values)

In [ ]:
# Store the fitted trend next to the data.
df['Trend'] = Y_pred
# What is left of the signal once the trend is removed.  The left operand is
# a 2-D column array; Y_pred is presumably (n, 1) as well because the model
# was fitted on a 2-D target -- TODO confirm if the fit call changes.
df['Trend_residue'] = df[['AirPassengers']].values - Y_pred

In [ ]:
# Mean absolute relative error of the trend alone against the raw signal.
MAPE_Trend = computePerf(df['AirPassengers'] , df['Trend'])
MAPE_Trend

In [ ]:
df[['Trend_residue']].describe()

In [ ]:
df.plot.line('real_time' , ['AirPassengers' , 'Trend' , 'Trend_residue'])

In [ ]:
df.describe()

Cyclic Modeling


In [ ]:
df.head(12)

In [ ]:
def generate_cycles(dframe, signal, K):
    """Find the cycle length in ``2 .. K-1`` whose per-phase mean best fits ``signal``.

    For every candidate length ``i`` each row is labelled with its phase
    (row position modulo ``i``) and the signal is estimated by the mean of all
    rows that share that phase.  Candidates are scored with ``computePerf``
    and the one with the lowest score is selected.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame containing the column ``signal``.  It is modified in place:
        ``<signal>_bestCycle`` (the winning estimate) and
        ``<signal>_bestCycle_Residue`` (signal minus that estimate) are added.
    signal : str
        Name of the column to model.
    K : int
        Exclusive upper bound on the cycle length; must be at least 3 so that
        there is at least one candidate (length 2).

    Returns
    -------
    tuple
        ``(dframe, cycle_frame, perfs, best)`` where ``dframe`` is the same
        object that was passed in, ``cycle_frame`` holds the signal plus a
        ``Cycle_<i>`` phase column and a ``Cycle_<i>_enc`` estimate column per
        candidate, ``perfs`` is a one-column frame of scores indexed by
        ``Cycle_<i>``, and ``best`` is the name of the winning candidate.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 3, i.e. there is no candidate to evaluate.
    """
    if K < 3:
        # Previously this surfaced as "min() arg is an empty sequence".
        raise ValueError(
            "K must be at least 3 to evaluate any cycle length, got " + str(K)
        )

    cycle_frame = dframe[[signal]].copy()
    positions = np.arange(cycle_frame.shape[0])

    scores = {}
    for length in range(2, K):
        name = 'Cycle_' + str(length)
        cycle_frame[name] = positions % length
        # Broadcast each phase's mean back onto the rows of that phase.
        cycle_frame[name + '_enc'] = cycle_frame.groupby(name)[signal].transform('mean')
        scores[name] = computePerf(cycle_frame[signal], cycle_frame[name + '_enc'])

    # On ties the shortest cycle wins because dicts keep insertion order.
    best = min(scores, key=scores.get)
    dframe[signal + '_bestCycle'] = cycle_frame[best + '_enc']
    dframe[signal + '_bestCycle_Residue'] = cycle_frame[signal] - cycle_frame[best + '_enc']

    # One row per candidate, single column 0 holding its score.
    perfs = pd.DataFrame([scores]).T

    return dframe, cycle_frame, perfs, best

In [ ]:
cycles.groupby(['Cycle_24'])['Trend_residue'].mean().to_dict()

In [ ]:
dframe = pd.DataFrame()
#dframe['Signal'] = np.arange(1,1000) % 32
ncycles = 35
df2 , cycles, perfs, best = generate_cycles(df, 'Trend_residue', ncycles)
#perfs.sort_values(axis=1)
MAPE_CYCLE = computePerf(df['Trend_residue'] , df['Trend_residue_bestCycle_Residue'])

MAPE_CYCLE
perfs

In [ ]:
best

In [ ]:
df.plot.line('time' , ['Trend_residue', 'Trend_residue_bestCycle' , 'Trend_residue_bestCycle_Residue', ] , figsize=[8, 8])

In [ ]:
df.plot.line('time' , ['Trend_residue_bestCycle_Residue', ] , figsize=[8, 8])

Autoregressive Modeling


In [ ]:
#df1 = pd.DataFrame()
#df1['Signal'] = range(100)

def generateLags(dframe , series, P):
    """Add ``P`` lagged copies of ``series`` to ``dframe`` in place.

    Columns ``<series>_Lag0`` .. ``<series>_Lag<P-1>`` are created, where lag
    ``i`` is the series shifted down by ``i`` rows.  The rows a shift leaves
    empty are filled with the mean of ``series``.  Only the new lag columns
    are filled; missing values in any other column of ``dframe`` are left
    untouched.

    Note that ``<series>_Lag0`` is a shift by zero, i.e. a copy of the series
    itself.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame containing ``series``; modified in place.
    series : str
        Name of the column to lag.
    P : int
        Number of lag columns to create (lags ``0 .. P-1``).

    Returns
    -------
    None
    """
    # Computed once, before any lag column is added.
    fill_value = dframe[series].mean()
    for lag in range(P):
        dframe[series + '_Lag' + str(lag)] = dframe[series].shift(lag).fillna(fill_value)
        

generateLags(df , 'Trend_residue_bestCycle_Residue' , 36)

In [ ]:


In [ ]:


In [ ]:
df.describe()

In [ ]:
df.head(12)

In [ ]:


In [ ]:
# Ridge auto-regression of the de-trended, de-cycled residue on its own lags.
ridge_AR = linear_model.Ridge()
target_col = 'Trend_residue_bestCycle_Residue'
# The '_Lag0' column is the series shifted by zero rows, i.e. an exact copy of
# the target; using it as a feature would leak the answer into the model.
lag_cols = [c for c in df.columns
            if c.startswith(target_col + '_Lag') and c != target_col + '_Lag0']
# Features are picked by name ('time' plus lags 1 and up) rather than by
# dropping the other columns, so re-running this cell after later cells have
# added columns to df still yields the same feature matrix.
inputs = df[['time'] + lag_cols].values
target = df[[target_col]].values
ridge_AR.fit(inputs, target)

AR_pred = ridge_AR.predict(inputs)

In [ ]:
AR_pred.shape

In [ ]:
df[['Trend_residue_bestCycle_Residue']].shape

In [ ]:
ridge_AR.score(inputs, target)

In [ ]:
# Store the auto-regressive estimate and the part of the signal it leaves unexplained.
df['AR'] = AR_pred
df['AR_residue'] = df['Trend_residue_bestCycle_Residue'].values - df['AR']

# Mean absolute relative error of the AR estimate against the residue it models.
MAPE_AR = computePerf(df['Trend_residue_bestCycle_Residue'], df['AR'])

MAPE_AR

In [ ]:
df.head()

In [ ]:
# Overlay the AR estimate on the residue it models, along the time axis.
# Using the residue itself as the x axis of a line plot joins the points in
# row order over an unsorted axis, which does not give a readable curve.
df.plot.line('time' , ['Trend_residue_bestCycle_Residue', 'AR'] , figsize=[8, 8])

In [ ]:
#df.plot?

In [ ]:
# Final estimate: sum of the trend, best-cycle and auto-regressive components.
df['Forecast'] = df['Trend'] + df['Trend_residue_bestCycle'] + df['AR']

In [ ]:
df.head()

In [ ]:
df.plot.line('time' , ['AirPassengers', 'Forecast', 'AR_residue'] , figsize=[8, 8])

In [ ]:
df.plot.line('time' , ['AR_residue'] , figsize=[8, 8])

In [ ]:


In [ ]:
# Mean absolute relative error of the combined forecast against the raw passenger counts.
MAPE_MODEL = computePerf(df['AirPassengers'], df['Forecast'])

MAPE_MODEL

In [ ]: