In [ ]:
import pandas as pd
import numpy as np
In [ ]:
def load_airline_passengers():
    """Download the AirPassengers dataset and return it as a DataFrame.

    The CSV's own header row is skipped and replaced by the column names
    ``ID``, ``time`` and ``AirPassengers``; ``ID`` becomes the index.
    A copy of the frame is also written to ``passengers_train.csv`` in the
    current working directory.
    """
    source_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/AirPassengers.csv"
    passengers = pd.read_csv(
        source_url,
        names=["ID", "time", "AirPassengers"],
        sep=r',',
        index_col='ID',
        engine='python',
        skiprows=1,
    )
    passengers.to_csv("passengers_train.csv")
    return passengers
In [ ]:
df = load_airline_passengers()
In [ ]:
In [ ]:
df.head()
In [ ]:
from datetime import datetime, date, time
# Earliest year found in the ``time`` column (fractional part truncated).
min_val = int(df['time'].min())
# January 1st of that earliest year.
first_date = date(min_val, 1, 1)
def to_date_passengers(x):
    """Convert a fractional year to the first day of the matching month.

    The integer part of ``x`` is the year.  The fractional part is scaled to
    twelfths and truncated to get a zero-based month.  The ``0.01`` nudge
    keeps values that sit just below a month boundary (such as
    ``1949.083333``) from truncating into the previous month.
    """
    year = int(x)
    fraction = x - year
    month_index = int(12 * (fraction + 0.01))
    return date(year, month_index + 1, 1)
# Add a calendar-date version of the fractional-year ``time`` column.
df['real_time'] = df['time'].apply(to_date_passengers)
In [ ]:
#pd.read_csv?
In [ ]:
#df['AirPassengers'] = np.log(df['AirPassengers'])
In [ ]:
%matplotlib inline
In [ ]:
first_date
In [ ]:
#df.plot?
In [ ]:
df.plot.line('real_time' , 'AirPassengers')
In [ ]:
def computePerf(signal, estimator):
    """Return the mean absolute percentage error of ``estimator``.

    The error of each element is expressed as a fraction of ``signal``
    (it is not multiplied by 100) and the absolute values are averaged.
    """
    relative_error = (signal - estimator) / signal
    return np.mean(abs(relative_error))
In [ ]:
import sklearn as skl
import sklearn.preprocessing as preprocessing
import sklearn.linear_model as linear_model
# Fit a linear trend of the passenger count against time.
ridge = linear_model.Ridge()
# Use the same plain ndarray for fitting and predicting, so the estimator is
# never handed feature names at predict time that it was not fitted with.
trend_inputs = df[['time']].values
ridge.fit(trend_inputs, df[['AirPassengers']].values)
Y_pred = ridge.predict(trend_inputs)
In [ ]:
ridge.score(df[['time']].values, df[['AirPassengers']].values)
In [ ]:
# Store the fitted trend next to the raw series.
df['Trend'] = Y_pred
# What is left of the series once the trend is removed; both operands are
# taken as column-shaped arrays so the subtraction is element-wise.
df['Trend_residue'] = df[['AirPassengers']].values - Y_pred
In [ ]:
MAPE_Trend = computePerf(df['AirPassengers'] , df['Trend'])
MAPE_Trend
In [ ]:
df[['Trend_residue']].describe()
In [ ]:
df.plot.line('real_time' , ['AirPassengers' , 'Trend' , 'Trend_residue'])
In [ ]:
df.describe()
In [ ]:
df.head(12)
In [ ]:
def generate_cycles(dframe, signal, K):
    """Find the cycle length whose per-phase mean best explains ``signal``.

    For every candidate length ``i`` in ``2 .. K - 1`` each row is labelled
    with its phase (row position modulo ``i``) and ``signal`` is estimated by
    the mean of all rows sharing that phase.  The candidate with the lowest
    MAPE, as computed by ``computePerf``, wins.

    Args:
        dframe: DataFrame holding the ``signal`` column.  It is modified in
            place: ``<signal>_bestCycle`` and ``<signal>_bestCycle_Residue``
            columns are added.
        signal: Name of the column to analyse.
        K: Exclusive upper bound on the cycle lengths tried; must be at
            least 3 so that one candidate exists.

    Returns:
        A tuple ``(dframe, cycle_frame, perfs, best)``.  ``cycle_frame`` holds
        the phase labels (``Cycle_<i>``) and estimates (``Cycle_<i>_enc``)
        of every candidate, ``perfs`` is a one-column DataFrame of MAPE
        values indexed by cycle name, and ``best`` is the winning cycle name.

    Raises:
        ValueError: If ``K`` is below 3, i.e. there is no cycle to evaluate.
    """
    if K < 3:
        raise ValueError("K must be at least 3 to evaluate a cycle, got %r" % (K,))

    cycle_frame = pd.DataFrame()
    cycle_frame[signal] = dframe[signal]
    positions = np.arange(cycle_frame.shape[0])
    mape_by_cycle = {}
    # Lengths 0 and 1 carry no periodic information, so start at 2.
    for length in range(2, K):
        name = 'Cycle_' + str(length)
        cycle_frame[name] = positions % length
        # Broadcast each phase's mean back onto the rows of that phase.
        cycle_frame[name + '_enc'] = cycle_frame.groupby(name)[signal].transform('mean')
        mape_by_cycle[name] = computePerf(cycle_frame[signal], cycle_frame[name + '_enc'])

    # The dict iterates in insertion order, so ties go to the shortest cycle.
    best = min(mape_by_cycle, key=mape_by_cycle.get)
    dframe[signal + '_bestCycle'] = cycle_frame[best + '_enc']
    dframe[signal + '_bestCycle_Residue'] = cycle_frame[signal] - cycle_frame[best + '_enc']
    # One row per candidate cycle, single column ``0`` holding its MAPE.
    perfs = pd.DataFrame({name: [mape] for name, mape in mape_by_cycle.items()}).T
    return dframe, cycle_frame, perfs, best
In [ ]:
# Mean trend residue for each phase of the 24-step cycle, as a {phase: mean} dict.
# NOTE: `cycles` is only assigned by the generate_cycles call in a later cell,
# so that cell has to be run before this one.
cycles.groupby(['Cycle_24'])['Trend_residue'].mean().to_dict()
In [ ]:
# NOTE(review): `dframe` is not used by any cell visible here; kept in case
# other cells rely on it.
dframe = pd.DataFrame()
# Cycle lengths 2 .. ncycles - 1 are evaluated on the de-trended series.
ncycles = 35
df2, cycles, perfs, best = generate_cycles(df, 'Trend_residue', ncycles)
# Score the cycle *estimate* against the signal, the same way the trend and
# AR stages are scored (signal first, estimate second).
MAPE_CYCLE = computePerf(df['Trend_residue'], df['Trend_residue_bestCycle'])
# A bare expression in the middle of a cell is not displayed, so print it.
print(MAPE_CYCLE)
perfs
In [ ]:
best
In [ ]:
df.plot.line('time' , ['Trend_residue', 'Trend_residue_bestCycle' , 'Trend_residue_bestCycle_Residue', ] , figsize=[8, 8])
In [ ]:
df.plot.line('time' , ['Trend_residue_bestCycle_Residue', ] , figsize=[8, 8])
In [ ]:
#df1 = pd.DataFrame()
#df1['Signal'] = range(100)
def generateLags(dframe, series, P):
    """Add ``P`` lagged copies of a column to ``dframe`` in place.

    Creates the columns ``<series>_Lag0`` .. ``<series>_Lag<P-1>``, where lag
    ``i`` is ``series`` shifted down by ``i`` rows; ``Lag0`` is therefore a
    plain copy of ``series``.  The leading rows that a shift leaves empty are
    filled with the mean of ``series``.

    Only the new lag columns are filled; missing values in any other column
    of ``dframe`` are left untouched.

    Args:
        dframe: DataFrame to extend; modified in place.
        series: Name of the column to lag.
        P: Number of lag columns to create.

    Returns:
        None.
    """
    source = dframe[series]
    fill_value = source.mean()
    for lag in range(P):
        dframe[series + '_Lag' + str(lag)] = source.shift(lag).fillna(fill_value)
# Add 36 lag columns (Lag0 .. Lag35) of the de-trended, de-cycled residue to df.
generateLags(df , 'Trend_residue_bestCycle_Residue' , 36)
In [ ]:
In [ ]:
In [ ]:
df.describe()
In [ ]:
df.head(12)
In [ ]:
In [ ]:
ridge_AR = linear_model.Ridge()
target_col = 'Trend_residue_bestCycle_Residue'
lag_prefix = target_col + '_Lag'
# Regress on time plus the strictly past lags.  The lag-0 column is an exact
# copy of the target and would let the model simply read the answer, so it is
# left out.  Selecting columns by name (instead of dropping the others) also
# keeps columns added by later cells out of the inputs if this cell is re-run.
feature_cols = ['time'] + [
    col for col in df.columns
    if col.startswith(lag_prefix) and col != lag_prefix + '0'
]
inputs = df[feature_cols].values
target = df[[target_col]].values
ridge_AR.fit(inputs, target)
AR_pred = ridge_AR.predict(inputs)
In [ ]:
AR_pred.shape
In [ ]:
df[['Trend_residue_bestCycle_Residue']].shape
In [ ]:
ridge_AR.score(inputs, target)
In [ ]:
# Store the autoregressive fit of the de-trended, de-cycled residue.
df['AR'] = AR_pred
# What the autoregressive model still fails to explain.
df['AR_residue'] = df['Trend_residue_bestCycle_Residue'].values - df['AR']
# Score the AR estimate against the signal it models.
MAPE_AR = computePerf(df['Trend_residue_bestCycle_Residue'], df['AR'])
MAPE_AR
In [ ]:
df.head()
In [ ]:
# Plot the AR estimate (y axis) against the signal it models (x axis).
# NOTE(review): rows are joined by a line in their stored order, not sorted by
# x — a scatter plot, or 'time' on the x axis, may have been intended; confirm.
df.plot.line('Trend_residue_bestCycle_Residue' , 'AR' , figsize=[8, 8])
In [ ]:
#df.plot?
In [ ]:
# Recombine the three fitted components (trend, best cycle, AR) into one estimate.
df['Forecast'] = df['Trend'] + df['Trend_residue_bestCycle'] + df['AR']
In [ ]:
df.head()
In [ ]:
df.plot.line('time' , ['AirPassengers', 'Forecast', 'AR_residue'] , figsize=[8, 8])
In [ ]:
df.plot.line('time' , ['AR_residue'] , figsize=[8, 8])
In [ ]:
In [ ]:
MAPE_MODEL = computePerf(df['AirPassengers'], df['Forecast'])
MAPE_MODEL
In [ ]: