In [1]:
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
data_path = 'Bike-Sharing-Dataset/hour.csv'
rides = pd.read_csv(data_path)
print("Bike Sharing dataset has {} data points with {} variables each.".format(*rides.shape))
display(rides.head(n=1))
In [2]:
display(rides.describe())
In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(rides, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
In [4]:
dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
for each in dummy_fields:
    dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False)
    rides = pd.concat([rides, dummies], axis=1)
fields_to_drop = ['instant', 'dteday', 'season', 'weathersit',
                  'weekday', 'atemp', 'mnth', 'workingday', 'hr']
data = rides.drop(fields_to_drop, axis=1)
data.head()
Out[4]:
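To make the loop above concrete, here is what pd.get_dummies produces on a toy series (a minimal sketch with made-up values, not drawn from the dataset):
In [ ]:
# Hypothetical example: one 0/1 indicator column per distinct value
toy = pd.Series([1, 2, 1, 3], name='season')
display(pd.get_dummies(toy, prefix='season'))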
In [5]:
quant_features = ['casual', 'registered', 'cnt', 'temp', 'hum', 'windspeed']
# Store scalings in a dictionary so we can convert back later
scaled_features = {}
for each in quant_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std
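The comment above promises we can convert back later; a minimal sketch of that inversion using the stored (mean, std) pairs:
In [ ]:
# Undo the standardization of one feature (here 'cnt') for inspection
mean, std = scaled_features['cnt']
print((data['cnt']*std + mean).head())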
In [6]:
# Save the last 21 days (21*24 hourly records) as a test set
test_data = data[-21*24:]
# Now remove the test data from the data set
data = data[:-21*24]
# Separate the data into features and targets
target_fields = ['cnt', 'casual', 'registered']
features, targets = data.drop(target_fields, axis=1), data[target_fields]
test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]
In [7]:
# Hold out the last 60 days (60*24 hourly records) of the remaining data as a validation set
train_features, train_targets = features[:-60*24], targets[:-60*24]
val_features, val_targets = features[-60*24:], targets[-60*24:]
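A quick sanity check that the hourly splits have the expected sizes (the last 21 days were removed as test data, and the last 60 days here form the validation set):
In [ ]:
print("train:", train_features.shape, "val:", val_features.shape, "test:", test_features.shape)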
In [8]:
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(train_features, train_targets,
                                                    test_size=0.2, train_size=0.8, random_state=50)
# Success
print("Training and testing split was successful.")
In [9]:
from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """ Calculate and return the variance-weighted R^2 score
    between the true and predicted values. """
    # Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true, y_predict, multioutput='variance_weighted')
    # Return the score
    return score
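As a quick check of the metric, a perfect prediction should score exactly 1.0 (toy values, not from the dataset):
In [ ]:
print(performance_metric([3.0, -0.5, 2.0], [3.0, -0.5, 2.0]))  # 1.0
print(performance_metric([3.0, -0.5, 2.0], [2.5, 0.0, 2.0]))   # ~0.92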
In [10]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
def fit_model(X, y):
    # random_state only has an effect when shuffle=True
    cv_sets = KFold(n_splits=10, shuffle=True, random_state=25)
    regressor = RandomForestRegressor(random_state=0)
    params = {'n_estimators': [10, 20, 40, 80], 'min_samples_split': [2, 4, 8, 16]}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    grid = grid.fit(X, y)
    print(grid.best_score_)
    return grid.best_estimator_
In [12]:
# Fit the model to the training data using grid search
reg = fit_model(X_train, y_train)
# Report the parameters of the optimal model
print("Parameters are {} for the optimal model.".format(reg.get_params()))
In [14]:
reg = RandomForestRegressor(n_estimators=80, min_samples_split=2, random_state=0).fit(X_train, y_train)
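Before plotting, it is worth scoring the refit forest on the held-out validation split (a sketch reusing performance_metric from above; the targets are still in scaled units, which does not affect R^2):
In [ ]:
val_score = performance_metric(val_targets, reg.predict(val_features))
print("Validation R^2: {:.3f}".format(val_score))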
In [15]:
fig, ax = plt.subplots(figsize=(8,4))
mean, std = scaled_features['cnt']
# predict() returns (n_samples, 3); after transposing, row 0 is the 'cnt' target
predictions = reg.predict(test_features).T*std + mean
ax.plot(predictions[0], label='Prediction')
ax.plot((test_targets['cnt']*std + mean).values, label='Data')
ax.set_xlim(right=len(predictions[0]))
ax.legend()
dates = pd.to_datetime(rides.loc[test_data.index, 'dteday'])
dates = dates.apply(lambda d: d.strftime('%b %d'))
ax.set_xticks(np.arange(len(dates))[12::24])
_ = ax.set_xticklabels(dates[12::24], rotation=45)
In [16]:
import tensorflow as tf
In [17]:
# Remove previous weights, biases, inputs, etc.
tf.reset_default_graph()
In [28]:
epochs = 500
learning_rate = 0.1
hidden_nodes = 28
output_nodes = 3
batch_size = 512
display_step = 100
total_len = X_train.shape[0]
n_input = X_train.shape[1]
x = tf.placeholder(shape=[None, n_input], dtype=tf.float32, name='x')
y = tf.placeholder(shape=[None, output_nodes], dtype=tf.float32, name='y')
In [29]:
def neural_network(x_tensor, weights, biases):
    # One hidden layer with a sigmoid activation and a linear output for regression
    x = tf.add(tf.matmul(x_tensor, weights['h1']), biases['h1'])
    x = tf.nn.sigmoid(x)
    x = tf.add(tf.matmul(x, weights['out']), biases['out'])
    return x
In [30]:
weights = {
    'h1': tf.Variable(tf.truncated_normal([n_input, hidden_nodes], stddev=0.1)),
    'out': tf.Variable(tf.truncated_normal([hidden_nodes, output_nodes], stddev=0.1))
}
biases = {
    'h1': tf.Variable(tf.zeros(hidden_nodes)),
    'out': tf.Variable(tf.zeros(output_nodes))
}
In [31]:
# Construct model
pred = neural_network(x, weights, biases)
# Define loss and optimizer
cost = tf.reduce_mean(tf.square(pred-y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
In [32]:
### Launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Training cycle
    for epoch in range(epochs):
        avg_cost = 0.
        total_batch = int(total_len/batch_size)
        # Loop over all full batches
        for i in range(total_batch):
            batch_x = X_train[i*batch_size:(i+1)*batch_size]
            batch_y = y_train[i*batch_size:(i+1)*batch_size]
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, p = sess.run([optimizer, cost, pred],
                               feed_dict={x: batch_x, y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Sample prediction from the last batch (kept for inspection; not used below)
        label_value = batch_y
        estimate = p
        err = label_value - estimate
        # Display logs every display_step epochs
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished!")
    # Evaluate the trained model: report the MSE on the held-out test split
    # (an argmax-based accuracy only makes sense for classification, not regression)
    print("Test MSE:", cost.eval({x: X_test, y: y_test}))
    predictions = sess.run([pred], {x: test_features})
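For a rough comparison with the random forest, the same variance-weighted R^2 can be computed on the network's test-set output (both the targets and the predictions here are still in scaled units, so no un-scaling is needed):
In [ ]:
print("Network test R^2: {:.3f}".format(performance_metric(test_targets.values, predictions[0])))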
In [34]:
fig, ax = plt.subplots(figsize=(8,4))
mean, std = scaled_features['cnt']
# The network output is (n_samples, 3); after transposing, row 0 is the 'cnt' target
prediction = predictions[0].T*std + mean
ax.plot(prediction[0], label='Prediction')
ax.plot((test_targets['cnt']*std + mean).values, label='Data')
ax.set_xlim(right=len(prediction[0]))
ax.legend()
dates = pd.to_datetime(rides.loc[test_data.index, 'dteday'])
dates = dates.apply(lambda d: d.strftime('%b %d'))
ax.set_xticks(np.arange(len(dates))[12::24])
_ = ax.set_xticklabels(dates[12::24], rotation=45)
In [ ]: