Project: Predicting Bike Sharing

Requirements

Python 3.5
scikit-learn 0.18
TensorFlow 1.1.0
...

Exploring the Data


In [1]:
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

data_path = 'Bike-Sharing-Dataset/hour.csv'
rides = pd.read_csv(data_path)

print("Bike Sharing dataset has {} data points with {} variables each.".format(*rides.shape))
display(rides.head(n=1))


Bike Sharing dataset has 17379 data points with 17 variables each.
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 3 13 16

In [2]:
display(rides.describe())


instant season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
count 17379.0000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000
mean 8690.0000 2.501640 0.502561 6.537775 11.546752 0.028770 3.003683 0.682721 1.425283 0.496987 0.475775 0.627229 0.190098 35.676218 153.786869 189.463088
std 5017.0295 1.106918 0.500008 3.438776 6.914405 0.167165 2.005771 0.465431 0.639357 0.192556 0.171850 0.192930 0.122340 49.305030 151.357286 181.387599
min 1.0000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.020000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 4345.5000 2.000000 0.000000 4.000000 6.000000 0.000000 1.000000 0.000000 1.000000 0.340000 0.333300 0.480000 0.104500 4.000000 34.000000 40.000000
50% 8690.0000 3.000000 1.000000 7.000000 12.000000 0.000000 3.000000 1.000000 1.000000 0.500000 0.484800 0.630000 0.194000 17.000000 115.000000 142.000000
75% 13034.5000 3.000000 1.000000 10.000000 18.000000 0.000000 5.000000 1.000000 2.000000 0.660000 0.621200 0.780000 0.253700 48.000000 220.000000 281.000000
max 17379.0000 4.000000 1.000000 12.000000 23.000000 1.000000 6.000000 1.000000 4.000000 1.000000 1.000000 1.000000 0.850700 367.000000 886.000000 977.000000

Visualize Feature Distributions


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(rides, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
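
As a quick look at the overall demand pattern, a short optional sketch (not part of the original run) plotting hourly ridership for the first ten days of the dataset:

# Hourly ride counts (cnt) over the first 10 days
rides[:24*10].plot(x='dteday', y='cnt', figsize=(14, 4), title='Hourly rides, first 10 days')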


Data preprocessing

One-hot encoding


In [4]:
dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
for each in dummy_fields:
    dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False)
    rides = pd.concat([rides, dummies], axis=1)

fields_to_drop = ['instant', 'dteday', 'season', 'weathersit', 
                  'weekday', 'atemp', 'mnth', 'workingday', 'hr']
data = rides.drop(fields_to_drop, axis=1)
data.head()


Out[4]:
yr holiday temp hum windspeed casual registered cnt season_1 season_2 ... hr_21 hr_22 hr_23 weekday_0 weekday_1 weekday_2 weekday_3 weekday_4 weekday_5 weekday_6
0 0 0 0.24 0.81 0.0 3 13 16 1 0 ... 0 0 0 0 0 0 0 0 0 1
1 0 0 0.22 0.80 0.0 8 32 40 1 0 ... 0 0 0 0 0 0 0 0 0 1
2 0 0 0.22 0.80 0.0 5 27 32 1 0 ... 0 0 0 0 0 0 0 0 0 1
3 0 0 0.24 0.75 0.0 3 10 13 1 0 ... 0 0 0 0 0 0 0 0 0 1
4 0 0 0.24 0.75 0.0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 59 columns

Scaling quantitative variables


In [5]:
quant_features = ['casual', 'registered', 'cnt', 'temp', 'hum', 'windspeed']
# Store scalings in a dictionary so we can convert back later
scaled_features = {}
for each in quant_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std
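
Because the means and standard deviations are kept in scaled_features, predictions can later be mapped back to real ride counts. A minimal sketch of the inverse transform (the helper name unscale is an assumption, not part of the original notebook):

def unscale(values, feature):
    """Map standardized values back to the original units of `feature`."""
    mean, std = scaled_features[feature]
    return values * std + mean

# Example: recover the original hourly counts from the scaled 'cnt' column
original_cnt = unscale(data['cnt'], 'cnt')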

Splitting the data into training, testing, and validation sets


In [6]:
# Save the last 21 days (21*24 hourly records) as test data
test_data = data[-21*24:]

# Now remove the test data from the data set 
data = data[:-21*24]

# Separate the data into features and targets
target_fields = ['cnt', 'casual', 'registered']
features, targets = data.drop(target_fields, axis=1), data[target_fields]
test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]

In [7]:
# Hold out the last 60 days (60*24 hourly records) of the remaining data as a validation set
train_features, train_targets = features[:-60*24], targets[:-60*24]
val_features, val_targets = features[-60*24:], targets[-60*24:]
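
A quick optional sanity check (not part of the original run) that the three splits account for every hourly record:

print("Train: {}, Validation: {}, Test: {}".format(
    train_features.shape, val_features.shape, test_features.shape))
assert len(train_features) + len(val_features) + len(test_features) == len(rides)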

Non-neural-network approach

RandomForestRegressor + GridSearchCV


In [8]:
from sklearn.model_selection import train_test_split

# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(train_features, train_targets, 
                                                    test_size=0.2, train_size=0.8, random_state=50)

# Success
print("Training and testing split was successful.")


Training and testing split was successful.

In [9]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true, y_predict, multioutput='variance_weighted')
    
    # Return the score
    return score
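
A quick illustrative check of the metric on toy values (the numbers are made up, not project data); a perfect prediction would score 1.0:

# Should print roughly 0.949 for this toy example
print(performance_metric([3.0, -0.5, 2.0, 7.0], [2.5, 0.0, 2.0, 8.0]))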

In [10]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

def fit_model(X, y):
    """ Grid-search a RandomForestRegressor over forest size and split
        threshold, scored with the R^2 metric on 10 sequential folds. """

    cv_sets = KFold(n_splits=10, shuffle=False)  # random_state has no effect without shuffling
    regressor = RandomForestRegressor(random_state=0)
    params = {'n_estimators': [10, 20, 40, 80], 'min_samples_split': [2, 4, 8, 16]}
    
    scoring_fnc = make_scorer(performance_metric)
    
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    grid = grid.fit(X,y)
    
    print(grid.best_score_)
    return grid.best_estimator_

In [12]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the value for optimized parameters
print("Parameters are {} for the optimal model.".format(reg.get_params()))


0.888103783867
Parameters are {'min_weight_fraction_leaf': 0.0, 'max_features': 'auto', 'random_state': 0, 'n_estimators': 80, 'verbose': 0, 'min_samples_split': 2, 'max_depth': None, 'min_impurity_split': 1e-07, 'oob_score': False, 'min_samples_leaf': 1, 'warm_start': False, 'criterion': 'mse', 'max_leaf_nodes': None, 'bootstrap': True, 'n_jobs': 1} for the optimal model.

Checkpoint


In [14]:
reg = RandomForestRegressor(n_estimators=80, min_samples_split=2, random_state=0).fit(X_train, y_train)

In [15]:
fig, ax = plt.subplots(figsize=(8,4))

mean, std = scaled_features['cnt']
# Rescale predictions back to ride counts; row 0 of the transposed array is 'cnt'
predictions = reg.predict(test_features).T*std + mean
ax.plot(predictions[0], label='Prediction')
ax.plot((test_targets['cnt']*std + mean).values, label='Data')
ax.set_xlim(right=len(predictions[0]))
ax.legend()

dates = pd.to_datetime(rides.loc[test_data.index]['dteday'])
dates = dates.apply(lambda d: d.strftime('%b %d'))
ax.set_xticks(np.arange(len(dates))[12::24])
_ = ax.set_xticklabels(dates[12::24], rotation=45)
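
To put a number on the fit above, an optional sketch (not part of the original run) scoring the rescaled 'cnt' predictions on the 21-day test window with the same R^2 metric:

from sklearn.metrics import r2_score

rf_test_r2 = r2_score((test_targets['cnt']*std + mean).values, predictions[0])
print("Random forest test R^2: {:.3f}".format(rf_test_r2))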


Neural network approach using TensorFlow


In [16]:
import tensorflow as tf

In [17]:
# Reset the default graph to clear any previously defined weights, biases, and inputs
tf.reset_default_graph()

In [28]:
epochs = 500
learning_rate = 0.1
hidden_nodes = 28
output_nodes = 3        # one output per target: cnt, casual, registered
batch_size = 512
display_step = 100      # print progress every 100 epochs

total_len = X_train.shape[0]   # number of training examples
n_input = X_train.shape[1]     # number of input features

x = tf.placeholder(shape=[None, n_input], dtype=tf.float32, name='x')
y = tf.placeholder(shape=[None, output_nodes], dtype=tf.float32, name='y')

In [29]:
def neural_network(x_tensor, weights, biases):
    # Single hidden layer with a sigmoid activation
    x = tf.add(tf.matmul(x_tensor, weights['h1']), biases['h1'])
    x = tf.nn.sigmoid(x)
    
    # Linear output layer (no activation, since this is regression)
    x = tf.add(tf.matmul(x, weights['out']), biases['out'])
    
    return x

In [30]:
weights = {
    'h1': tf.Variable(tf.truncated_normal([n_input, hidden_nodes], stddev=0.1)),
    'out': tf.Variable(tf.truncated_normal([hidden_nodes, output_nodes], stddev=0.1))
}

biases = {
    'h1': tf.Variable(tf.zeros(hidden_nodes)),
    'out': tf.Variable(tf.zeros(output_nodes))
}

In [31]:
# Construct model
pred = neural_network(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.square(pred-y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [32]:
### Launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Training cycle
    for epoch in range(epochs):
        
        avg_cost = 0.
        total_batch = int(total_len/batch_size)
        # Loop over mini-batches (the final batch and any remainder are skipped)
        for i in range(total_batch-1):
            batch_x = X_train[i*batch_size:(i+1)*batch_size]
            batch_y = y_train[i*batch_size:(i+1)*batch_size]
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, p = sess.run([optimizer, cost, pred], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch

        # Keep the last mini-batch's labels and predictions for inspection
        label_value = batch_y
        estimate = p
        err = label_value - estimate

        # Print the average cost every display_step epochs
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test "accuracy": fraction of test rows where the largest of the three
    # scaled outputs is predicted in the right position. This is not a proper
    # regression metric (an R^2 check follows the final plot below); it is
    # kept to match the original run's output.
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: X_test, y: y_test}))
    
    predictions = sess.run([pred], {x: test_features})  # predictions for the 21-day test window


Epoch: 0001 cost= 0.713052006
Epoch: 0101 cost= 0.048547885
Epoch: 0201 cost= 0.046817406
Epoch: 0301 cost= 0.046626318
Epoch: 0401 cost= 0.046551026
Optimization Finished!
Accuracy: 0.86265

In [34]:
fig, ax = plt.subplots(figsize=(8,4))

mean, std = scaled_features['cnt']
# Rescale predictions back to ride counts; row 0 of the transposed array is 'cnt'
prediction = predictions[0].T*std + mean
ax.plot(prediction[0], label='Prediction')
ax.plot((test_targets['cnt']*std + mean).values, label='Data')
ax.set_xlim(right=len(prediction[0]))
ax.legend()

dates = pd.to_datetime(rides.loc[test_data.index]['dteday'])
dates = dates.apply(lambda d: d.strftime('%b %d'))
ax.set_xticks(np.arange(len(dates))[12::24])
_ = ax.set_xticklabels(dates[12::24], rotation=45)
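
Likewise, an optional sketch (not part of the original run) scoring the network's rescaled 'cnt' predictions on the same test window, giving a regression metric that can be compared directly with the random forest above:

from sklearn.metrics import r2_score

nn_test_r2 = r2_score((test_targets['cnt']*std + mean).values, prediction[0])
print("Neural network test R^2: {:.3f}".format(nn_test_r2))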


