In [179]:
import pandas as pd
dataframe = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=1000)
In [180]:
import numpy as np
import pandas as pd
from ggplot import *
"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.
"""
def normalize_features(array):
"""
Normalize the features in the data set.
"""
#array_normalized = (array-array.mean())/array.std()
array_normalized = np.divide((np.subtract(array,array.mean())),array.std())
mu = array.mean()
sigma = array.std()
print "sigma", sigma
return array_normalized, mu, sigma
features = dataframe[['rain', 'precipi', 'meanwindspdi', 'thunder']].copy()
print features.head()
features, mu, sigma = normalize_features(features)
print features
In [181]:
def compute_cost(features, values, theta):
"""
Compute the cost function given a set of features / values,
and the values for our thetas.
This can be the same code as the compute_cost function in the lesson #3 exercises,
but feel free to implement your own.
"""
m = len(values)
sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
cost = sum_of_square_errors / (2.0*m)
return cost
def gradient_descent(features, values, theta, alpha, num_iterations):
"""
Perform gradient descent given a data set with an arbitrary number of features.
This can be the same gradient descent code as in the lesson #3 exercises,
but feel free to implement your own.
"""
m = len(values)
cost_history = []
for i in range(40):
new_theta = theta
for j in xrange(0,len(theta)):
new_theta[j] = theta[j] + (1.0*alpha/m) * (values - np.dot(features, theta)).dot(features[:,j])
theta = new_theta
cost_history.append(compute_cost(features, values, theta))
return theta, pd.Series(cost_history)
def predictions(dataframe):
'''
The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
Using the information stored in the dataframe, let's predict the ridership of
the NYC subway using linear regression with gradient descent.
You can download the complete turnstile weather dataframe here:
https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
Your prediction should have a R^2 value of 0.20 or better.
You need to experiment using various input features contained in the dataframe.
We recommend that you don't use the EXITSn_hourly feature as an input to the
linear model because we cannot use it as a predictor: we cannot use exits
counts as a way to predict entry counts.
Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
give you a random subet (~15%) of the data contained in
turnstile_data_master_with_weather.csv. You are encouraged to experiment with
this computer on your own computer, locally.
If you'd like to view a plot of your cost history, uncomment the call to
plot_cost_history below. The slowdown from plotting is significant, so if you
are timing out, the first thing to do is to comment out the plot command again.
If you receive a "server has encountered an error" message, that means you are
hitting the 30-second limit that's placed on running your program. Try using a
smaller number for num_iterations if that's the case.
If you are using your own algorithm/models, see if you can optimize your code so
that it runs faster.
'''
# Values
values = dataframe[['ENTRIESn_hourly']]
m = len(values)
# Select Features (try different features!)
features = dataframe[['rain', 'precipi', 'meanwindspdi', 'thunder']].copy()
features, mu, sigma = normalize_features(features)
# Add UNIT to features using dummy variables
#dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
#features = features.join(dummy_units)
features['ones'] = pd.Series(np.ones(m)) # Add a column of 1s (y intercept)
#print features
# Convert features and values to numpy arrays
features_array = np.array(features)
values_array = np.array(values).flatten()
#print "features", features_array
#print "values", values_array
# Set values for alpha, number of iterations.
alpha = 0.01 # please feel free to change this value
num_iterations = 75 # please feel free to change this value
# Initialize theta, perform gradient descent
theta_gradient_descent = np.zeros(len(features.columns))
theta_gradient_descent, cost_history = gradient_descent(features_array,
values_array,
theta_gradient_descent,
alpha,
num_iterations)
#print "theta", theta_gradient_descent
#print "cost", cost_history
plot = None
# -------------------------------------------------
# Uncomment the next line to see your cost history
# -------------------------------------------------
#plot = plot_cost_history(alpha, cost_history)
#
# Please note, there is a possibility that plotting
# this in addition to your calculation will exceed
# the 30 second limit on the compute servers.
#print features_array, theta_gradient_descent
predictions = np.dot(features_array, theta_gradient_descent)
#print predictions
return predictions, plot
def plot_cost_history(alpha, cost_history):
"""This function is for viewing the plot of your cost history.
You can run it by uncommenting this
plot_cost_history(alpha, cost_history)
call in predictions.
If you want to run this locally, you should print the return value
from this function.
"""
cost_df = pd.DataFrame({
'Cost_History': cost_history,
'Iteration': range(len(cost_history))
})
return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )
In [182]:
predictions, plot = predictions(dataframe)
values = dataframe[['ENTRIESn_hourly']]
r_squared = 1 - np.sum((np.square(values.values - predictions)))/np.sum(np.square(values.values - np.mean(values.values)))
In [183]:
print r_squared
[ 81.71874535 17.32919409 129.93160132 -95.33262634 23.71990046
-21.84393317 14.18507349 -94.99585299 21.48823584 0.59198864]
Your r^2 value is 0.11123162895
The plot of your cost history is shown below.
In [186]:
import numpy as np
import pandas
from ggplot import *
"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.
"""
def normalize_features(array):
"""
Normalize the features in the data set.
"""
array_normalized = (array-array.mean())/array.std()
mu = array.mean()
sigma = array.std()
return array_normalized, mu, sigma
def compute_cost(features, values, theta):
"""
Compute the cost function given a set of features / values,
and the values for our thetas.
This can be the same code as the compute_cost function in the lesson #3 exercises,
but feel free to implement your own.
"""
m = len(values)
sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
cost = sum_of_square_errors / (2.0*m)
return cost
def gradient_descent(features, values, theta, alpha, num_iterations):
"""
Perform gradient descent given a data set with an arbitrary number of features.
This can be the same gradient descent code as in the lesson #3 exercises,
but feel free to implement your own.
"""
m = len(values)
cost_history = []
for iteration in xrange(0,num_iterations):
new_theta = theta
for j in xrange(0,len(theta)):
#print j, features[:,j]
new_theta[j] = theta[j] + ((1.0*alpha)/m) * (values - np.dot(features, theta)).dot(features[:,j])
theta = new_theta
cost_history.append(compute_cost(features, values, theta))
return theta, pandas.Series(cost_history)
def predictions(dataframe):
'''
The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
Using the information stored in the dataframe, let's predict the ridership of
the NYC subway using linear regression with gradient descent.
You can download the complete turnstile weather dataframe here:
https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
Your prediction should have a R^2 value of 0.20 or better.
You need to experiment using various input features contained in the dataframe.
We recommend that you don't use the EXITSn_hourly feature as an input to the
linear model because we cannot use it as a predictor: we cannot use exits
counts as a way to predict entry counts.
Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
give you a random subet (~15%) of the data contained in
turnstile_data_master_with_weather.csv. You are encouraged to experiment with
this computer on your own computer, locally.
If you'd like to view a plot of your cost history, uncomment the call to
plot_cost_history below. The slowdown from plotting is significant, so if you
are timing out, the first thing to do is to comment out the plot command again.
If you receive a "server has encountered an error" message, that means you are
hitting the 30-second limit that's placed on running your program. Try using a
smaller number for num_iterations if that's the case.
If you are using your own algorithm/models, see if you can optimize your code so
that it runs faster.
'''
# Select Features (try different features!)
features = dataframe[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'rain', 'mintempi', 'maxtempi']].copy()
# Add UNIT to features using dummy variables
#dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit')
#features = features.join(dummy_units)
# Values
values = dataframe[['ENTRIESn_hourly']]
m = len(values)
#features, mu, sigma = normalize_features(features)
features['ones'] = np.ones(m) # Add a column of 1s (y intercept)
print features.head
# Convert features and values to numpy arrays
features_array = np.array(features)
values_array = np.array(values).flatten()
# Set values for alpha, number of iterations.
alpha = 0.0003 # please feel free to change this value
num_iterations = 1500 # please feel free to change this value
# Initialize theta, perform gradient descent
theta_gradient_descent = np.zeros(len(features.columns))
theta_gradient_descent, cost_history = gradient_descent(features_array,
values_array,
theta_gradient_descent,
alpha,
num_iterations)
print theta_gradient_descent
plot = None
# -------------------------------------------------
# Uncomment the next line to see your cost history
# -------------------------------------------------
plot = plot_cost_history(alpha, cost_history)
#
# Please note, there is a possibility that plotting
# this in addition to your calculation will exceed
# the 30 second limit on the compute servers.
predictions = np.dot(features_array, theta_gradient_descent)
return predictions, plot
def plot_cost_history(alpha, cost_history):
"""This function is for viewing the plot of your cost history.
You can run it by uncommenting this
plot_cost_history(alpha, cost_history)
call in predictions.
If you want to run this locally, you should print the return value
from this function.
"""
cost_df = pandas.DataFrame({
'Cost_History': cost_history,
'Iteration': range(len(cost_history))
})
return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )
In [ ]: