Project: Predicting Bike Sharing

Requirements

Python 3.5
scikit-learn 0.18
TensorFlow 1.1.0
...

Exploring the Data


In [1]:
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

data_path = 'Bike-Sharing-Dataset/hour.csv'
rides = pd.read_csv(data_path)

print("Bike Sharing dataset has {} data points with {} variables each.".format(*rides.shape))
display(rides.head(n=1))


Bike Sharing dataset has 17379 data points with 17 variables each.
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 3 13 16

In [2]:
display(rides.describe())


instant season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
count 17379.0000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000
mean 8690.0000 2.501640 0.502561 6.537775 11.546752 0.028770 3.003683 0.682721 1.425283 0.496987 0.475775 0.627229 0.190098 35.676218 153.786869 189.463088
std 5017.0295 1.106918 0.500008 3.438776 6.914405 0.167165 2.005771 0.465431 0.639357 0.192556 0.171850 0.192930 0.122340 49.305030 151.357286 181.387599
min 1.0000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.020000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 4345.5000 2.000000 0.000000 4.000000 6.000000 0.000000 1.000000 0.000000 1.000000 0.340000 0.333300 0.480000 0.104500 4.000000 34.000000 40.000000
50% 8690.0000 3.000000 1.000000 7.000000 12.000000 0.000000 3.000000 1.000000 1.000000 0.500000 0.484800 0.630000 0.194000 17.000000 115.000000 142.000000
75% 13034.5000 3.000000 1.000000 10.000000 18.000000 0.000000 5.000000 1.000000 2.000000 0.660000 0.621200 0.780000 0.253700 48.000000 220.000000 281.000000
max 17379.0000 4.000000 1.000000 12.000000 23.000000 1.000000 6.000000 1.000000 4.000000 1.000000 1.000000 1.000000 0.850700 367.000000 886.000000 977.000000

Visualize Feature Distributions


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(rides, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
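
As a quick look at the overall demand pattern, a short optional sketch (not part of the original run) plotting hourly ridership for the first ten days of the dataset:

# Hourly ride counts (cnt) over the first 10 days
rides[:24*10].plot(x='dteday', y='cnt', figsize=(14, 4), title='Hourly rides, first 10 days')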


Data preprocessing

One-hot encoding


In [4]:
dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
for each in dummy_fields:
    dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False)
    rides = pd.concat([rides, dummies], axis=1)

fields_to_drop = ['instant', 'dteday', 'season', 'weathersit', 
                  'weekday', 'atemp', 'mnth', 'workingday', 'hr']
data = rides.drop(fields_to_drop, axis=1)
data.head()


Out[4]:
yr holiday temp hum windspeed casual registered cnt season_1 season_2 ... hr_21 hr_22 hr_23 weekday_0 weekday_1 weekday_2 weekday_3 weekday_4 weekday_5 weekday_6
0 0 0 0.24 0.81 0.0 3 13 16 1 0 ... 0 0 0 0 0 0 0 0 0 1
1 0 0 0.22 0.80 0.0 8 32 40 1 0 ... 0 0 0 0 0 0 0 0 0 1
2 0 0 0.22 0.80 0.0 5 27 32 1 0 ... 0 0 0 0 0 0 0 0 0 1
3 0 0 0.24 0.75 0.0 3 10 13 1 0 ... 0 0 0 0 0 0 0 0 0 1
4 0 0 0.24 0.75 0.0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 59 columns

Scaling quantitative variables


In [5]:
quant_features = ['casual', 'registered', 'cnt', 'temp', 'hum', 'windspeed']
# Store scalings in a dictionary so we can convert back later
scaled_features = {}
for each in quant_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std
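
Because the means and standard deviations are kept in scaled_features, predictions can later be mapped back to real ride counts. A minimal sketch of the inverse transform (the helper name unscale is an assumption, not part of the original notebook):

def unscale(values, feature):
    """Map standardized values back to the original units of `feature`."""
    mean, std = scaled_features[feature]
    return values * std + mean

# Example: recover the original hourly counts from the scaled 'cnt' column
original_cnt = unscale(data['cnt'], 'cnt')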

Splitting the data into training, testing, and validation sets


In [6]:
# Save the last 21 days (21*24 hourly records) as test data
test_data = data[-21*24:]

# Now remove the test data from the data set 
data = data[:-21*24]

# Separate the data into features and targets
target_fields = ['cnt', 'casual', 'registered']
features, targets = data.drop(target_fields, axis=1), data[target_fields]
test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]

In [7]:
# Hold out the last 60 days (60*24 hourly records) of the remaining data as a validation set
train_features, train_targets = features[:-60*24], targets[:-60*24]
val_features, val_targets = features[-60*24:], targets[-60*24:]
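
A quick optional sanity check (not part of the original run) that the three splits account for every hourly record:

print("Train: {}, Validation: {}, Test: {}".format(
    train_features.shape, val_features.shape, test_features.shape))
assert len(train_features) + len(val_features) + len(test_features) == len(rides)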

Non-neural-network approach

RandomForestRegressor + GridSearchCV


In [8]:
from sklearn.model_selection import train_test_split

# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(train_features, train_targets, 
                                                    test_size=0.2, train_size=0.8, random_state=50)

# Success
print("Training and testing split was successful.")


Training and testing split was successful.

In [9]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true, y_predict, multioutput='variance_weighted')
    
    # Return the score
    return score
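
A quick illustrative check of the metric on toy values (the numbers are made up, not project data); a perfect prediction would score 1.0:

# Should print roughly 0.949 for this toy example
print(performance_metric([3.0, -0.5, 2.0, 7.0], [2.5, 0.0, 2.0, 8.0]))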

In [10]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

def fit_model(X, y):
    """ Grid-search a RandomForestRegressor over forest size and split
        threshold, scored with the R^2 metric on 10 sequential folds. """

    cv_sets = KFold(n_splits=10, shuffle=False)  # random_state has no effect without shuffling
    regressor = RandomForestRegressor(random_state=0)
    params = {'n_estimators': [10, 20, 40, 80], 'min_samples_split': [2, 4, 8, 16]}
    
    scoring_fnc = make_scorer(performance_metric)
    
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    grid = grid.fit(X,y)
    
    print(grid.best_score_)
    return grid.best_estimator_

In [12]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the value for optimized parameters
print("Parameters are {} for the optimal model.".format(reg.get_params()))


0.888103783867
Parameters are {'min_weight_fraction_leaf': 0.0, 'max_features': 'auto', 'random_state': 0, 'n_estimators': 80, 'verbose': 0, 'min_samples_split': 2, 'max_depth': None, 'min_impurity_split': 1e-07, 'oob_score': False, 'min_samples_leaf': 1, 'warm_start': False, 'criterion': 'mse', 'max_leaf_nodes': None, 'bootstrap': True, 'n_jobs': 1} for the optimal model.

Checkpoint


In [14]:
reg = RandomForestRegressor(n_estimators=80, min_samples_split=2, random_state=0).fit(X_train, y_train)

In [15]:
fig, ax = plt.subplots(figsize=(8,4))

mean, std = scaled_features['cnt']
# Rescale predictions back to ride counts; row 0 of the transposed array is 'cnt'
predictions = reg.predict(test_features).T*std + mean
ax.plot(predictions[0], label='Prediction')
ax.plot((test_targets['cnt']*std + mean).values, label='Data')
ax.set_xlim(right=len(predictions[0]))
ax.legend()

dates = pd.to_datetime(rides.loc[test_data.index]['dteday'])
dates = dates.apply(lambda d: d.strftime('%b %d'))
ax.set_xticks(np.arange(len(dates))[12::24])
_ = ax.set_xticklabels(dates[12::24], rotation=45)
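
To put a number on the fit above, an optional sketch (not part of the original run) scoring the rescaled 'cnt' predictions on the 21-day test window with the same R^2 metric:

from sklearn.metrics import r2_score

rf_test_r2 = r2_score((test_targets['cnt']*std + mean).values, predictions[0])
print("Random forest test R^2: {:.3f}".format(rf_test_r2))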


Neural network approach using TensorFlow


In [16]:
import tensorflow as tf

In [17]:
# Reset the default graph to clear any previously defined weights, biases, and inputs
tf.reset_default_graph()

In [28]:
epochs = 500
learning_rate = 0.1
hidden_nodes = 28
output_nodes = 3        # one output per target: cnt, casual, registered
batch_size = 512
display_step = 100      # print progress every 100 epochs

total_len = X_train.shape[0]   # number of training examples
n_input = X_train.shape[1]     # number of input features

x = tf.placeholder(shape=[None, n_input], dtype=tf.float32, name='x')
y = tf.placeholder(shape=[None, output_nodes], dtype=tf.float32, name='y')

In [29]:
def neural_network(x_tensor, weights, biases):
    # Single hidden layer with a sigmoid activation
    x = tf.add(tf.matmul(x_tensor, weights['h1']), biases['h1'])
    x = tf.nn.sigmoid(x)
    
    # Linear output layer (no activation, since this is regression)
    x = tf.add(tf.matmul(x, weights['out']), biases['out'])
    
    return x

In [30]:
weights = {
    'h1': tf.Variable(tf.truncated_normal([n_input, hidden_nodes], stddev=0.1)),
    'out': tf.Variable(tf.truncated_normal([hidden_nodes, output_nodes], stddev=0.1))
}

biases = {
    'h1': tf.Variable(tf.zeros(hidden_nodes)),
    'out': tf.Variable(tf.zeros(output_nodes))
}

In [31]:
# Construct model
pred = neural_network(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.square(pred-y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [32]:
### Launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Training cycle
    for epoch in range(epochs):
        
        avg_cost = 0.
        total_batch = int(total_len/batch_size)
        # Loop over mini-batches (the final batch and any remainder are skipped)
        for i in range(total_batch-1):
            batch_x = X_train[i*batch_size:(i+1)*batch_size]
            batch_y = y_train[i*batch_size:(i+1)*batch_size]
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, p = sess.run([optimizer, cost, pred], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch

        # Keep the last mini-batch's labels and predictions for inspection
        label_value = batch_y
        estimate = p
        err = label_value - estimate

        # Print the average cost every display_step epochs
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test "accuracy": fraction of test rows where the largest of the three
    # scaled outputs is predicted in the right position. This is not a proper
    # regression metric (an R^2 check follows the final plot below); it is
    # kept to match the original run's output.
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: X_test, y: y_test}))
    
    predictions = sess.run([pred], {x: test_features})  # predictions for the 21-day test window


Epoch: 0001 cost= 0.713052006
Epoch: 0101 cost= 0.048547885
Epoch: 0201 cost= 0.046817406
Epoch: 0301 cost= 0.046626318
Epoch: 0401 cost= 0.046551026
Optimization Finished!
Accuracy: 0.86265

In [34]:
fig, ax = plt.subplots(figsize=(8,4))

mean, std = scaled_features['cnt']
# Rescale predictions back to ride counts; row 0 of the transposed array is 'cnt'
prediction = predictions[0].T*std + mean
ax.plot(prediction[0], label='Prediction')
ax.plot((test_targets['cnt']*std + mean).values, label='Data')
ax.set_xlim(right=len(prediction[0]))
ax.legend()

dates = pd.to_datetime(rides.loc[test_data.index]['dteday'])
dates = dates.apply(lambda d: d.strftime('%b %d'))
ax.set_xticks(np.arange(len(dates))[12::24])
_ = ax.set_xticklabels(dates[12::24], rotation=45)
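
Likewise, an optional sketch (not part of the original run) scoring the network's rescaled 'cnt' predictions on the same test window, giving a regression metric that can be compared directly with the random forest above:

from sklearn.metrics import r2_score

nn_test_r2 = r2_score((test_targets['cnt']*std + mean).values, prediction[0])
print("Neural network test R^2: {:.3f}".format(nn_test_r2))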


