Now on to the real challenge: the hourly data. As usual, let's make some plots and do a bit of analysis to get a feel for the data.
In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [14]:
hourly = pd.read_csv('Bike-Sharing-Dataset/hour.csv',header = 0)
In [15]:
hourly.head(20)
Out[15]:
In [16]:
type(hourly)
for row_index, row in hourly.iterrows():
    print row_index, row['registered']
    if row_index > 5:
        break
In [17]:
weekly = np.zeros((24, 7))
for row_index, row in hourly.iterrows():
    weekly[row['hr'], row['weekday']] += row['registered']
In [18]:
print np.max(weekly)
In [19]:
plt.imshow(np.sqrt(weekly/np.max(weekly)), interpolation='nearest', aspect=0.5)
Out[19]:
In [20]:
weekly = np.zeros((24, 7))
for row_index, row in hourly.iterrows():
    weekly[row['hr'], row['weekday']] += row['casual']
plt.imshow(np.sqrt(weekly/np.max(weekly)), interpolation='nearest', aspect=0.5)
Out[20]:
In [21]:
weekly = np.zeros((24, 7))
for row_index, row in hourly.iterrows():
    weekly[row['hr'], row['weekday']] += row['cnt']
plt.imshow(np.sqrt(weekly/np.max(weekly)), interpolation='nearest', aspect=0.5)
Out[21]:
I already knew this from analyzing the daily data, but these plots confirm that registered riders tend to commute, while casual riders joyride on the weekends. And, if you look closely, you can see that registered users tend to head out of work a bit earlier on Fridays. To get the best model, we'll need to include weekday (which we may want to put through a one-hot encoder first).
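Since that one-hot encoding will come up again below, here is a minimal sketch of what it could look like with pandas (the weekday_0 ... weekday_6 column names are just get_dummies' default prefix scheme, not columns from the dataset):
# sketch: one-hot encode weekday; each day becomes a 0/1 indicator column
weekday_dummies = pd.get_dummies(hourly['weekday'], prefix='weekday')
print weekday_dummies.head()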
I want to use the same training/test cut that Kaggle does, so I need to put the day of the month into a column and then drop the unnecessary columns.
In [22]:
hourly['day'] = pd.DatetimeIndex(hourly.dteday).day
hourly = hourly.drop(['instant','dteday','casual','registered'], axis = 1)
In [83]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# weekday is already an integer code, so LabelEncoder is essentially a
# no-op here, but it's a useful sanity check before one-hot encoding
le = LabelEncoder()
weekday_labels = le.fit_transform(hourly['weekday'])
hourly[['weekday','day']]
Out[83]:
In [92]:
# try one-hot encoding the weekday column with sklearn's OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(hourly[['weekday']])
# transform returns a sparse matrix, so toarray() is needed to print it
weekday_onehot = enc.transform(hourly[['weekday']])
print weekday_onehot[0:100].toarray()
# (LabelEncoder.inverse_transform can map encoded labels back if needed)
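The encoded matrix never actually gets joined back onto the feature set in this notebook; as a sketch of how that could work (assuming we're willing to convert the sparse result to a dense array, and noting that X_encoded is just an illustrative name):
# sketch: replace the raw weekday column with its one-hot expansion
weekday_dense = weekday_onehot.toarray()
other_features = hourly.drop(['weekday', 'cnt'], axis=1).values
X_encoded = np.hstack([other_features, weekday_dense])
print X_encoded.shape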
In [23]:
Xtrain = hourly[hourly.day < 19].drop('cnt',axis=1).values #the data for the training set
ytrain = (hourly[hourly.day < 19])['cnt'].values #the target of the training set
Xtest = hourly[hourly.day >= 19].drop('cnt',axis=1).values #the data for the test set
ytest = (hourly[hourly.day >= 19])['cnt'].values #the target of the test set
In [24]:
print ytrain.shape
print Xtrain.shape
print ytest.shape
print Xtest.shape
print Xtest[0]
In [25]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
In [26]:
cv = cross_validation.ShuffleSplit(len(Xtrain), n_iter=3, test_size=0.2,
                                   random_state=0)
for train, test in cv:
    reg = RandomForestRegressor(n_estimators=500).fit(Xtrain[train], ytrain[train])
    print reg.score(Xtrain[train], ytrain[train]), reg.score(Xtrain[test], ytrain[test])
print reg.score(Xtest, ytest)
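Since the split mimics Kaggle's, it's also worth scoring with the metric that competition uses, root mean squared logarithmic error (RMSLE), rather than the default R^2 score; a minimal sketch:
# sketch: RMSLE, the Kaggle metric for this competition; the counts (and
# the forest's predictions of them) are non-negative, so log1p is safe
pred = reg.predict(Xtest)
print np.sqrt(np.mean((np.log1p(pred) - np.log1p(ytest)) ** 2))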
In [28]:
estimators = [10,100,500]
grid = GridSearchCV(estimator=reg, param_grid=dict(n_estimators=estimators), n_jobs=-1)
grid.fit(Xtrain,ytrain)
print grid.best_score_
print grid.best_estimator_.n_estimators
In [29]:
print grid.grid_scores_
RandomForestRegressor does a much better job than the other estimators. I'm not seeing much difference when I change n_estimators, and the mean score is much lower than I'd expect (i.e., it doesn't match the CV scores above). I wonder if I did this right...
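One likely culprit: GridSearchCV defaults to unshuffled 3-fold cross-validation, and this data is ordered by date, so each fold holds out a contiguous block of months the model never trained on. A sketch of the fix, reusing the ShuffleSplit from above:
# sketch: give GridSearchCV the same shuffled CV used earlier, so its
# mean scores are comparable to the ones printed above
cv = cross_validation.ShuffleSplit(len(Xtrain), n_iter=3, test_size=0.2,
                                   random_state=0)
grid = GridSearchCV(estimator=reg, param_grid=dict(n_estimators=estimators),
                    cv=cv, n_jobs=-1)
grid.fit(Xtrain, ytrain)
print grid.best_score_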
In [30]:
pred = reg.predict(Xtest)  # put the predicted values into an array
hrInd = 3       # the column number of the hr column
weekdayInd = 5  # the column number of the weekday column
weeklyPredict = np.zeros((24, 7))
weeklyActual = np.zeros((24, 7))
for i in range(len(ytest)):
    # Xtest is a float array, so cast the indices back to int
    weeklyPredict[int(Xtest[i, hrInd]), int(Xtest[i, weekdayInd])] += pred[i]
    weeklyActual[int(Xtest[i, hrInd]), int(Xtest[i, weekdayInd])] += ytest[i]

def makeDifferencePlot(weeklyPredict, weeklyActual):
    plt.figure(1, figsize=(12, 6))
    plt.subplot(141)
    plt.imshow(np.sqrt(weeklyPredict), interpolation='nearest', aspect=0.5)
    plt.subplot(142)
    plt.imshow(np.sqrt(weeklyActual), interpolation='nearest', aspect=0.5)
    plt.subplot(143)
    plt.imshow(weeklyPredict - weeklyActual, interpolation='nearest', aspect=0.5)
    plt.subplot(144)
    plt.imshow(weeklyPredict / weeklyActual, interpolation='nearest', aspect=0.5)
    plt.show()

makeDifferencePlot(weeklyPredict, weeklyActual)
This looks reasonable. Obviously not perfect, and the difference plot has some structure. The quotient is reasonably flat, though. This is also over the entire test set - there may be more pronounced differences under certain conditions (i.e., weather, temperature, season, etc.).
Let's take a look at the same plots when the weather is good and the temperature is high.
In [31]:
weeklyPredict = np.zeros((24, 7))
weeklyActual = np.zeros((24, 7))
weatherInd = 7  # the column number of the weathersit column
atempInd = 9    # the column number of the atemp column
for i in range(len(ytest)):
    if Xtest[i, weatherInd] < 3 and Xtest[i, atempInd] > .6:
        weeklyPredict[int(Xtest[i, hrInd]), int(Xtest[i, weekdayInd])] += pred[i]
        weeklyActual[int(Xtest[i, hrInd]), int(Xtest[i, weekdayInd])] += ytest[i]
makeDifferencePlot(weeklyPredict, weeklyActual)
In [32]:
weeklyPredict = np.zeros((24, 7))
weeklyActual = np.zeros((24, 7))
weatherInd = 7  # the column number of the weathersit column
atempInd = 9    # the column number of the atemp column
for i in range(len(ytest)):
    if Xtest[i, weatherInd] > 2 and Xtest[i, atempInd] < .6:
        weeklyPredict[int(Xtest[i, hrInd]), int(Xtest[i, weekdayInd])] += pred[i]
        weeklyActual[int(Xtest[i, hrInd]), int(Xtest[i, weekdayInd])] += ytest[i]
makeDifferencePlot(weeklyPredict, weeklyActual)
The model is way over-predicting Saturday rides in bad weather.
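To put a rough number on that (weekday 6 is Saturday here - the first row of the dataset is 2011-01-01, a Saturday, with weekday 6):
# sketch: ratio of total predicted to total actual Saturday rides under the
# bad-weather filter; assumes the test set has some bad-weather Saturday hours
saturday = 6
print weeklyPredict[:, saturday].sum() / weeklyActual[:, saturday].sum()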
In [34]:
from sklearn.learning_curve import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [38]:
title = "Learning Curves (Random forest)"
# Cross validation with 30 iterations to get smoother mean test and train
# score curves, each time with 30% of the data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(len(Xtrain), n_iter=30,
                                   test_size=0.3, random_state=0)
reg = RandomForestRegressor(n_estimators=100)
plot_learning_curve(reg, title, Xtrain, ytrain, ylim=(0.6, 1.01), cv=cv, n_jobs=1)
Out[38]:
In [37]:
plt.show()
This plot can be read as needing more training data, but it can also be read as needing fewer features: the model does very well on the training set but noticeably worse on the cross-validation set, so it's overfitting/has high variance.
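One way to attack the variance side is to constrain the individual trees; the min_samples_leaf and max_features values below are illustrative guesses, not tuned:
# sketch: a more constrained forest should trade some training-set fit
# for a smaller train/cross-validation gap
reg_constrained = RandomForestRegressor(n_estimators=100, min_samples_leaf=5,
                                        max_features=0.5)
plot_learning_curve(reg_constrained, "Learning Curves (constrained forest)",
                    Xtrain, ytrain, ylim=(0.6, 1.01), cv=cv, n_jobs=1)
plt.show()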
In [ ]: