Data Description
In [2]:
%matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
In [3]:
"""
ESTIMATING THE COEFFICIENTS
"""
# Daily Capital Bikeshare rentals: read the per-day CSV into a DataFrame
# and preview its first two rows.
bike_dat = pd.read_csv(filepath_or_buffer="Data/day.csv")
bike_dat.head(n=2)
Out[3]:
In [4]:
# Raw-data scatter: the `atemp` column on the x-axis against the rental
# count `cnt`; points are semi-transparent so dense regions stand out.
plt.scatter(x=bike_dat['atemp'], y=bike_dat['cnt'], alpha=0.3)
Out[4]:
In [5]:
# Fit a simple linear regression of rental count on temperature by
# ordinary least squares, then display the fitted-model summary table.
est_s = smf.ols('cnt ~ temp', bike_dat).fit()
est_s.summary()
Out[5]:
In [6]:
# Scatter of temperature against rental count, with labelled axes.
plt.scatter(x=bike_dat['temp'], y=bike_dat['cnt'], alpha=0.3)
plt.xlabel(xlabel="Temperature")
plt.ylabel(ylabel="Number of Bike Rentals")
Out[6]:
In [7]:
# Draw the fitted regression line by evaluating
# intercept + slope * temp at the smallest and largest observed temperature.
temp_range = np.array([bike_dat['temp'].min(), bike_dat['temp'].max()])
plt.plot(temp_range,
         est_s.params['Intercept'] + est_s.params['temp'] * temp_range,
         linewidth=2)
Out[7]:
In [8]:
# Build a grid of 100 evenly spaced temperatures spanning the observed
# range; the single column is named 'temp' to match the model formula.
x_prime = pd.DataFrame({
    'temp': np.linspace(start=bike_dat['temp'].min(),
                        stop=bike_dat['temp'].max(),
                        num=100),
})
In [9]:
# Predicted rental counts for the temperature grid, obtained from the
# fitted simple model's own predict method.
y_hat = est_s.predict(exog=x_prime)
In [10]:
# Plot the model predictions over the temperature grid as a red line.
plt.plot(x_prime, y_hat, color='r', lw=2, alpha=0.9)
Out[10]:
In [11]:
# Multiple linear regression with four predictors.
# Note: `temp` loses significance here because it is collinear with
# `atemp` (multicollinearity).
est_m = smf.ols(
    'cnt ~ atemp + temp + workingday + windspeed',
    bike_dat,
).fit()
est_m.summary()
Out[11]:
In [12]:
# Pairwise scatter-plot matrix of the selected columns (observe the
# unsurprising correlation between atemp and temp).
# `cols` is reused by the correlation-matrix cell that follows.
cols = ['cnt','atemp','windspeed','weathersit','temp','workingday','hum']
# scatter_matrix lives in pandas.plotting; the old top-level alias
# pd.scatter_matrix was deprecated and later removed, so calling it
# raises AttributeError on current pandas releases.
pd.plotting.scatter_matrix(bike_dat[cols])
Out[12]:
In [13]:
# Pearson correlation coefficients between the selected columns
# (each column is one variable), rendered as a labelled heat map.
corr_matrix = np.corrcoef(bike_dat[cols].to_numpy(), rowvar=False)
sm.graphics.plot_corr(corr_matrix, xnames=cols)
Out[13]:
In [14]:
# Interaction terms: in a formula, 'a:b' adds the interaction between
# a and b. Here temp and windspeed interact, alongside their main effects.
est_m = smf.ols(
    'cnt ~ temp + windspeed + temp:windspeed + workingday',
    bike_dat,
).fit()
est_m.summary()
Out[14]:
In [15]:
# Shorthand for the same model: 'a*b' expands to 'a + b + a:b'.
est_m = smf.ols(
    'cnt ~ temp*windspeed + workingday',
    bike_dat,
).fit()
est_m.summary()
Out[15]:
In [16]:
# Hourly Capital Bikeshare rentals: read the per-hour CSV into a DataFrame
# and preview its first two rows.
bike_hour_dat = pd.read_csv(filepath_or_buffer="Data/hour.csv")
bike_hour_dat.head(n=2)
Out[16]:
In [19]:
# Multiple regression on the hourly data; `hr` enters the formula as a
# plain term next to the weather and working-day predictors.
est_hour_m = smf.ols(
    'cnt ~ temp + hum + workingday + hr + weathersit',
    bike_hour_dat,
).fit()
est_hour_m.summary()
Out[19]:
In [21]:
# Display the adjusted R-squared of the hourly multiple regression (est_hour_m).
est_hour_m.rsquared_adj
Out[21]:
In [28]:
# Display the adjusted R-squared of est_m for comparison with the hourly model.
# NOTE(review): est_m is reassigned several times; assuming the cells ran top
# to bottom, it is the daily 'cnt ~ temp*windspeed + workingday' fit here.
est_m.rsquared_adj
Out[28]:
In [34]:
# Add a 0/1 `rush` indicator: 1 when the hour falls in 6-9 or 16-18
# (both ranges inclusive), 0 otherwise.
rush_mask = (bike_hour_dat['hr'].between(6, 9)
             | bike_hour_dat['hr'].between(16, 18))
bike_hour_dat['rush'] = 0
# Assign through .loc in a single indexing step. The previous chained form
# bike_hour_dat['rush'][mask] = 1 writes to an intermediate object, which
# raises SettingWithCopyWarning and leaves the frame unchanged when pandas
# Copy-on-Write is active.
bike_hour_dat.loc[rush_mask, 'rush'] = 1
In [37]:
# Hourly regression in which the `rush` indicator takes the place of `hr`.
est_hour_with_rush_m = smf.ols(
    'cnt ~ temp + hum + workingday + weathersit + rush',
    bike_hour_dat,
).fit()
est_hour_with_rush_m.summary()
Out[37]:
In [36]:
# print (est_hour_m.rsquared_adj)
# print (est_m.rsquared_adj)
# print (est_hour_with_rush_m.rsquared_adj)
In [47]:
# Side-by-side adjusted R-squared of the three models.
# Use a numeric format (%.4f) so values are rounded to four decimals. The
# previous '%0.6s' cut the float's string form to six characters, which
# truncates instead of rounding and drops the exponent of any value that
# prints in scientific notation.
print("Day : [%.4f]" % est_m.rsquared_adj)
print("Hour : [%.4f]" % est_hour_m.rsquared_adj)
print("Hour(With Rush hour) : [%.4f]" % est_hour_with_rush_m.rsquared_adj)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: