In [ ]:
# Import the libraries we need: pandas, NumPy and Matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Plot styling: switch to the ggplot theme and make 15x10 the default figure size
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 10)})
In [ ]:
# Load the cleaned month-wise market arrivals data (quantity and price) into a DataFrame
df = pd.read_csv(filepath_or_buffer='MonthWiseMarketArrivals_clean.csv')
In [ ]:
# Convert the date column to pandas datetime values
df['date'] = pd.DatetimeIndex(df['date'])
In [ ]:
# Index the rows by calendar month: a monthly PeriodIndex built from the date column
df.index = pd.PeriodIndex(data=df['date'], freq='M')
In [ ]:
# Sort the data frame chronologically by the date column.
# The index is built from that same column and can inherit its name ("date");
# pandas refuses to sort by a label that names both a column and an index
# level, so the index name is cleared before sorting.
df.index.name = None
df = df.sort_values(by="date")
In [ ]:
# Preview the first five rows of the prepared frame
df.head(n=5)
In [ ]:
# Keep only the rows whose city is BANGALORE
dfBang = df.loc[df['city'] == 'BANGALORE']
In [ ]:
# Preview the first five Bangalore rows
dfBang.head(n=5)
In [ ]:
# Scatter plot of priceMod against quantity for Bangalore, with marker size 100
dfBang.plot.scatter(x="quantity", y="priceMod", s=100)
In [ ]:
# Same scatter plot, with semi-transparent markers and the x axis limited to 0..2,000,000
dfBang.plot.scatter(x="quantity", y="priceMod", s=100, alpha=0.7, xlim=[0, 2000000])
In [ ]:
# Pairwise correlations between the numeric columns only. Text columns such as
# city cannot be correlated and make a bare corr() call fail on recent pandas.
dfBang.select_dtypes(include=['number', 'bool']).corr()
In [ ]:
# Show floats with two decimal places. The fully qualified option name is used
# because the short form 'precision' is not accepted by recent pandas releases.
pd.set_option('display.precision', 2)
In [ ]:
# Correlation matrix again, now displayed with the reduced precision.
# Only numeric columns are used: text columns such as city cannot be
# correlated and make a bare corr() call fail on recent pandas.
dfBang.select_dtypes(include=['number', 'bool']).corr()
In [ ]:
# Grid of pairwise scatter plots for the Bangalore data, with a kernel density
# estimate of each column on the diagonal.
# scatter_matrix is imported from pandas.plotting; the older
# pandas.tools.plotting location has been removed from pandas.
from pandas.plotting import scatter_matrix
scatter_matrix(dfBang, figsize=(15, 15), diagonal='kde', s=50)
In [ ]:
import statsmodels.api as sm
In [ ]:
# Ordinary least squares regression of priceMod on quantity.
# statsmodels' OLS does not add an intercept on its own, so a constant column
# is added to the regressors; without it the fitted line is forced through the
# origin and the reported R-squared is not comparable to the usual one.
x = dfBang.quantity
y = dfBang.priceMod
lm = sm.OLS(y, sm.add_constant(x)).fit()
In [ ]:
# Display the summary of the fitted regression model
lm.summary()
In [ ]:
# Import the seaborn library for more functionality
import seaborn as sns
In [ ]:
# Scatter plot with a fitted straight line, to see whether there is a relationship
sns.regplot(data=dfBang, x="quantity", y="priceMod");
In [ ]:
# Joint plot of the same two columns, drawn with the regression ("reg") kind
sns.jointplot(data=dfBang, x="quantity", y="priceMod", kind="reg");
However, our data is recorded at a constant interval of one month. We can therefore analyze it as a time series to determine the long-term trend, to forecast the future, or to perform some other form of analysis.
In [ ]:
# Plot styling: switch to the ggplot theme and make 15x10 the default figure size
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 10)})
In [ ]:
# Re-index the Bangalore rows by their date values (a DatetimeIndex)
dfBang.index = pd.DatetimeIndex(dfBang['date'])
In [ ]:
# Preview the first five rows after re-indexing
dfBang.head(n=5)
In [ ]:
# Take the priceMin column as the time series to analyse
ts = dfBang['priceMin']
In [ ]:
# Line plot of the raw priceMin series
ts.plot(kind='line')
In [ ]:
# We take the log transform to reduce the impact of high values
# NOTE(review): assumes every priceMin value is strictly positive; a zero or
# negative value would produce -inf/NaN here — confirm against the data
ts_log = np.log(ts)
In [ ]:
# Line plot of the log-transformed series
ts_log.plot(kind='line')
In [ ]:
# First-order differencing: subtract the previous observation from each value
# to take out the trend (the first entry has no predecessor and becomes NaN)
ts_log_diff = ts_log.diff(periods=1)
In [ ]:
# Draw the differenced series and the log series together for comparison
ts_log_diff.plot(kind='line')
ts_log.plot(kind='line')
In [ ]:
# Smooth the differenced series with a moving average over a 12-observation
# (12-month) window; the first 11 entries have no full window and are NaN.
# Series.rolling(...).mean() replaces the module-level pd.rolling_mean helper,
# which has been removed from pandas.
ts_log_diff_ma = ts_log_diff.rolling(window=12).mean()
In [ ]:
# Simple exponential smoothing of the differenced series, with a half-life of
# 24 observations.
# Series.ewm(...).mean() replaces the module-level pd.ewma helper, which has
# been removed from pandas.
ts_log_diff_exp = ts_log_diff.ewm(halflife=24).mean()
In [ ]:
# Compare both smoothed versions against the unsmoothed differenced series
ts_log_diff_ma.plot(kind='line')
ts_log_diff_exp.plot(kind='line')
ts_log_diff.plot(kind='line')
Now we can fit an ARIMA model to this series (explaining ARIMA is beyond the scope of this workshop).
In [ ]:
from statsmodels.tsa.arima_model import ARIMA
In [ ]:
# ARIMA specification on the log series with order (p, d, q) = (0, 1, 2):
# no autoregressive terms, first-order differencing, two moving-average terms
model = ARIMA(endog=ts_log, order=(0, 1, 2))
In [ ]:
# Fit the ARIMA model; disp=-1 is forwarded to the fitting routine
# NOTE(review): presumably disp=-1 silences the optimizer's convergence output — confirm
# NOTE(review): the `disp` argument and plotting fittedvalues against the
# differenced series match the legacy statsmodels.tsa.arima_model API; newer
# statsmodels releases replace it with statsmodels.tsa.arima.model, which
# behaves differently — confirm against the installed version
results_MA = model.fit(disp=-1)
# Plot the differenced log series ...
plt.plot(ts_log_diff)
# ... and overlay the model's fitted values in blue
plt.plot(results_MA.fittedvalues, color='blue')
In [ ]: