Another linear regression algorithm is ordinary least squares (OLS), which is guaranteed to find the optimal least-squares solution. Here I'm using the same feature sets from section 2.1 — [hour, weekday, holiday, conds, UNIT] and [weekday, day_week, holiday, rain, fog, the mean and current weather measurements, conds, UNIT, datetime] — and trying again with OLS, which is built into the StatsModels library. I'm also plotting a comparison of predicted and observed values, and a Q-Q plot of the residual quantiles versus the quantiles of a normal distribution. From the plots below we can say that, at least for this case, both OLS and gradient descent generate very similar results.
In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
%matplotlib inline
# Load the combined turnstile/weather dataset; parse the 'datetime' column into
# real timestamps so it can be dummy-encoded later.
df = pd.read_csv('turnstile_weather_v2.csv', parse_dates=['datetime'])
# Target variable for all models below: hourly subway entries per record.
df_entries_hourly = df['ENTRIESn_hourly']
In [8]:
def calc_results(features, values):
    """Fit an ordinary-least-squares model of `values` on `features`.

    A constant column is prepended to the design matrix so the model
    includes an intercept term. Returns the fitted statsmodels results
    object (exposing `rsquared`, `fittedvalues`, `resid`, ...).
    """
    design = sm.add_constant(features)
    return sm.OLS(values, design).fit()
In [9]:
def plot_OLS_comparison(results, sample_size):
    """Draw a plot comparing the true relationship to the OLS predictions.

    Plots the first `sample_size` observed hourly-entry values against the
    model's fitted values, with a prediction-interval band around the fit.

    Parameters
    ----------
    results : statsmodels regression results
        Fitted OLS results whose rows align with `df_entries_hourly`.
    sample_size : int
        Number of leading observations to display.
    """
    # Prediction-interval bounds around each fitted value.
    prstd, iv_l, iv_u = wls_prediction_std(results)
    fig, ax = plt.subplots(figsize=(8, 6))
    x_arange = np.arange(0, sample_size, 1)
    ax.plot(x_arange, df_entries_hourly[:sample_size], 'b', label="data")
    ax.plot(x_arange, results.fittedvalues[:sample_size], 'g', label="OLS")
    ax.plot(x_arange, iv_u[:sample_size], 'r--')
    ax.plot(x_arange, iv_l[:sample_size], 'r--')
    ax.set_xlabel('First %i values' % sample_size)
    # Fixed typo in the axis label: 'etries' -> 'entries'.
    ax.set_ylabel('Hourly entries')
    ax.set_title('Compare observed and predicted value samples')
    ax.legend(loc='best');
In [10]:
def qqplot_residuals(residuals):
    """Show a Q-Q plot of the model residuals against a normal distribution.

    The 's' line standardizes the reference line to the sample's scale.
    """
    figure = sm.qqplot(residuals, line='s')
    plt.title('Compare residuals for normal distribution')
    plt.show()
In [11]:
# First model: features [hour, weekday, holiday, conds, UNIT].
# .copy() makes an independent frame: assigning the 'holiday' column to a
# slice/view of df would raise pandas' SettingWithCopyWarning and may not
# take effect.
features = df[['hour', 'weekday']].copy()
# Include holiday as a binary feature; 05-30-11 is Memorial Day.
features['holiday'] = np.where(df['DATEn'] == '05-30-11', 1, 0)
# One-hot encode the categorical weather conditions.
features = features.join(pd.get_dummies(df['conds'], prefix='conds'))
# One-hot encode the turnstile unit identifier.
features = features.join(pd.get_dummies(df['UNIT'], prefix='unit'))
results = calc_results(features, df_entries_hourly)
# The dummy columns make the full summary very long; uncomment to inspect it.
# print(results.summary())
print('Features: [hour, weekday, holiday, conds, UNIT] R^2: %.5f' % results.rsquared)
plot_OLS_comparison(results, 100)
qqplot_residuals(results.resid)
In [12]:
# Second model: the wider feature set (day/weather measurements plus dummies).
# .copy() makes an independent frame: assigning the 'holiday' column to a
# slice/view of df would raise pandas' SettingWithCopyWarning and may not
# take effect.
features = df[['weekday', 'day_week', 'rain', 'fog', 'meanprecipi', 'meanpressurei', 'meantempi', 'meanwspdi', 'precipi', 'pressurei', 'tempi', 'wspdi']].copy()
# Include holiday as a binary feature; 05-30-11 is Memorial Day.
features['holiday'] = np.where(df['DATEn'] == '05-30-11', 1, 0)
# One-hot encode the categorical weather conditions.
features = features.join(pd.get_dummies(df['conds'], prefix='conds'))
# One-hot encode the turnstile unit identifier.
features = features.join(pd.get_dummies(df['UNIT'], prefix='unit'))
# One-hot encode the timestamp.
# NOTE(review): this creates one dummy column per distinct timestamp, which
# hugely inflates the design matrix and the in-sample R^2 — consider whether
# this is intended, or whether extracting hour/day components would be better.
features = features.join(pd.get_dummies(df['datetime'], prefix='datetime'))
results = calc_results(features, df_entries_hourly)
print('Features: [many] R^2: %.5f' % results.rsquared)
plot_OLS_comparison(results, 100)
qqplot_residuals(results.resid)