In [45]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
%matplotlib inline
In [26]:
# Read the tab-separated arrivals file and give its columns short names.
data = pd.read_csv('dataset/month-wise-market-arrivals', delimiter='\t')
data.columns = ['market', 'month', 'year', 'quantity', 'priceMin', 'priceMax', 'priceMod']

# Join month and year into one string per row, parse it into timestamps and
# store the result as the 'date' column.
data['date'] = pd.DatetimeIndex(data['month'] + '-' + data['year'].map(str))

# Index the frame by monthly periods derived from the parsed dates.
data.index = pd.PeriodIndex(data['date'], freq='M')
In [27]:
# Preview the first rows of the frame.
data.head()
Out[27]:
In [28]:
# Order the rows chronologically by the 'date' column.
# The index was built from that same column and therefore carries the name
# 'date' as well; current pandas refuses sort_values(by="date") when the label
# matches both an index level and a column. The index name is cleared for the
# sort and put back afterwards, so the resulting frame is unchanged.
index_name = data.index.name
data = data.rename_axis(None).sort_values(by="date").rename_axis(index_name)
data.head()
Out[28]:
In [30]:
# Discard the columns that are not used in the rest of the analysis.
data = data.drop(columns=["market", "month", "year", "priceMin", "priceMax"])
data.head()
Out[30]:
In [33]:
# Plot the priceMod column against the frame's index.
data.priceMod.plot()
Out[33]:
In [35]:
# Natural log of priceMod, stored next to the original column.
data['priceModLog'] = np.log(data['priceMod'])
data.head()
Out[35]:
In [36]:
# Plot the log-transformed price against the frame's index.
data.priceModLog.plot()
Out[36]:
In [37]:
# Constant baseline: average the log prices, then map that single value back
# with exp and use it as the prediction for every row.
model_mean_pred = data["priceModLog"].mean()
data["priceMean"] = np.exp(model_mean_pred)
data.plot(x="date", y=["priceMod", "priceMean"], kind="line")
Out[37]:
In [40]:
def RMSE(predicted, actual):
    """Return the root-mean-square error between predictions and observations.

    Parameters
    ----------
    predicted, actual : pandas Series/DataFrame, or any array-like of numbers
        When either argument is a pandas object the subtraction follows
        pandas alignment rules and missing values are excluded from the mean
        (``sum`` skips them and ``count`` only counts non-missing entries).
        Plain sequences and NumPy arrays are compared element by element,
        also ignoring NaN entries.

    Returns
    -------
    The square root of the mean squared difference.
    """
    pandas_types = (pd.Series, pd.DataFrame)
    if isinstance(predicted, pandas_types) or isinstance(actual, pandas_types):
        squared_error = (predicted - actual) ** 2
        return np.sqrt(squared_error.sum() / squared_error.count())

    # Non-pandas inputs have no .count(); use the NaN-aware NumPy mean instead.
    squared_error = (np.asarray(predicted, dtype=float) - np.asarray(actual, dtype=float)) ** 2
    return np.sqrt(np.nanmean(squared_error))
# Error of the constant (mean) prediction against the observed priceMod.
print('model_mean_RMSE : {}'.format(RMSE(data.priceMean, data.priceMod)))
In [41]:
# Elapsed time between each row's date and the earliest date in the frame.
data["timeIndex"] = data["date"] - data["date"].min()
data.head()
Out[41]:
In [42]:
# Express timeIndex as the number of calendar months since the earliest date.
# The count is taken from the year/month fields instead of dividing a
# timedelta by np.timedelta64(1, 'M'): a month is not a fixed duration and
# recent pandas versions reject that unit. Because it is derived from the
# 'date' column only, this cell can also be re-run without corrupting
# timeIndex.
start_date = data["date"].min()
data["timeIndex"] = (
    (data["date"].dt.year - start_date.year) * 12
    + (data["date"].dt.month - start_date.month)
).astype(int)
data.head()
Out[42]:
In [47]:
# Fit an ordinary least squares regression of the log price on timeIndex
# and show the fit summary.
model_linear = smf.ols(formula='priceModLog ~ timeIndex', data=data).fit()
model_linear.summary()
Out[47]:
In [49]:
# The regression was fitted on log prices, so its in-sample predictions are
# exponentiated before being compared with priceMod.
model_linear_pred = model_linear.predict()
data["priceLinear"] = np.exp(model_linear_pred)
data.plot(x="date", y=["priceMod", "priceLinear"], kind="line")
Out[49]:
In [50]:
# Error of the linear-trend prediction against the observed priceMod.
print('model_linear_RMSE : {}'.format(RMSE(data.priceLinear, data.priceMod)))
In [52]:
# Log price of the preceding row (the first row has no predecessor and is NaN).
data["priceModLogShift1"] = data["priceModLog"].shift(1)
data.head()
Out[52]:
In [56]:
# Row-to-row change of the log price.
data["priceModLogDiff"] = data["priceModLog"] - data["priceModLogShift1"]
# Prediction that simply repeats the previous row's price.
data["priceRandom"] = np.exp(data["priceModLogShift1"])
data.plot(x="timeIndex", y=["priceMod", "priceRandom"], kind="line")
Out[56]:
In [54]:
# Error of the previous-value prediction against the observed priceMod.
print('model_random_RMSE : {}'.format(RMSE(data.priceRandom, data.priceMod)))
In [64]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Replace the PeriodIndex with timestamps before decomposing.
# PeriodIndex.to_datetime() is no longer available in pandas; to_timestamp()
# is the supported conversion. The isinstance guard lets the cell be re-run
# after the index has already been converted.
if isinstance(data.index, pd.PeriodIndex):
    data.index = data.index.to_timestamp()

# Additive decomposition of the log price into trend, seasonal and residual parts.
decomposition = seasonal_decompose(data.priceModLog, model="additive")
decomposition.plot()
Out[64]:
In [65]:
# Pull the three components out of the decomposition result.
trend, seasonal, residual = (
    decomposition.trend,
    decomposition.seasonal,
    decomposition.resid,
)

# Prediction without the residual part: trend plus seasonal, mapped back from
# log space with exp.
data["priceDecomp"] = np.exp(trend + seasonal)
data.plot(x="timeIndex", y=["priceMod", "priceDecomp"], kind="line")
Out[65]:
In [67]:
# Error of the trend + seasonal prediction against the observed priceMod.
print('model_decomp_RMSE : {}'.format(RMSE(data.priceDecomp, data.priceMod)))
In [57]:
# Log price series and its row-to-row differences, used by the ARIMA cells.
ts = data.priceModLog
# dropna() returns a new Series with the missing entries removed; the column
# stored in `data` is not modified in place.
ts_diff = data.priceModLogDiff.dropna()
In [58]:
# statsmodels.tsa.arima_model.ARIMA was removed in statsmodels 0.13. Prefer
# the replacement class and fall back to the legacy one only on installations
# that do not ship the new module.
try:
    from statsmodels.tsa.arima.model import ARIMA
    arima_fit_kwargs = {}
except ImportError:
    from statsmodels.tsa.arima_model import ARIMA
    # The legacy fit() prints optimiser output unless disp is negative.
    arima_fit_kwargs = {"disp": -1}

# ARMA(1, 1) on the differenced log prices (the differencing order is 0
# because the series passed in is already differenced).
model_AR1MA = ARIMA(ts_diff, order=(1,0,1))
results_ARIMA = model_AR1MA.fit(**arima_fit_kwargs)

# Overlay the in-sample fitted values on the differenced series.
ts_diff.plot()
results_ARIMA.fittedvalues.plot()
Out[58]:
In [59]:
# Turn the fitted log differences back into price-level predictions.
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
# Accumulate the differences to get the total change since the first row.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
# Start every row at the first observed log price (.iloc replaces the removed
# .ix indexer for positional access) ...
predictions_ARIMA_log = pd.Series(ts.iloc[0], index=ts.index)
# ... then add the accumulated change; rows without a fitted value (such as
# the first one, which has no difference) keep the starting value.
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
data['priceARIMA'] = np.exp(predictions_ARIMA_log)
data.plot(kind="line", x="timeIndex", y = ["priceMod", "priceARIMA"])
Out[59]:
In [66]:
# Error of the ARIMA-based prediction against the observed priceMod.
print('model_arima_RMSE : {}'.format(RMSE(data.priceARIMA, data.priceMod)))
In [61]:
# Overlay the observed price with the mean, linear, previous-value and ARIMA
# predictions on a single chart.
comparison_columns = ["priceMod", "priceMean", "priceLinear", "priceRandom", "priceARIMA"]
data.plot(kind="line", x="timeIndex", y=comparison_columns)
Out[61]: