In [179]:
import pandas as pd
dataframe = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=1000)

In [180]:
import numpy as np
import pandas as pd
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def normalize_features(array):
    """
    Normalize the features in the data set.
    """
    #array_normalized = (array-array.mean())/array.std()

    array_normalized = np.divide((np.subtract(array,array.mean())),array.std())
    
    mu = array.mean()
    sigma = array.std()
    print "sigma", sigma

    return array_normalized, mu, sigma

features = dataframe[['rain', 'precipi', 'meanwindspdi', 'thunder']].copy()
print features.head()
features, mu, sigma = normalize_features(features)
print features


   rain  precipi  meanwindspdi  thunder
0     0        0             5        0
1     0        0             5        0
2     0        0             5        0
3     0        0             5        0
4     0        0             5        0
sigma rain            0
precipi         0
meanwindspdi    0
thunder         0
dtype: float64
     rain  precipi  meanwindspdi  thunder
0     NaN      NaN           NaN      NaN
1     NaN      NaN           NaN      NaN
2     NaN      NaN           NaN      NaN
3     NaN      NaN           NaN      NaN
4     NaN      NaN           NaN      NaN
5     NaN      NaN           NaN      NaN
6     NaN      NaN           NaN      NaN
7     NaN      NaN           NaN      NaN
8     NaN      NaN           NaN      NaN
9     NaN      NaN           NaN      NaN
10    NaN      NaN           NaN      NaN
11    NaN      NaN           NaN      NaN
12    NaN      NaN           NaN      NaN
13    NaN      NaN           NaN      NaN
14    NaN      NaN           NaN      NaN
15    NaN      NaN           NaN      NaN
16    NaN      NaN           NaN      NaN
17    NaN      NaN           NaN      NaN
18    NaN      NaN           NaN      NaN
19    NaN      NaN           NaN      NaN
20    NaN      NaN           NaN      NaN
21    NaN      NaN           NaN      NaN
22    NaN      NaN           NaN      NaN
23    NaN      NaN           NaN      NaN
24    NaN      NaN           NaN      NaN
25    NaN      NaN           NaN      NaN
26    NaN      NaN           NaN      NaN
27    NaN      NaN           NaN      NaN
28    NaN      NaN           NaN      NaN
29    NaN      NaN           NaN      NaN
..    ...      ...           ...      ...
970   NaN      NaN           NaN      NaN
971   NaN      NaN           NaN      NaN
972   NaN      NaN           NaN      NaN
973   NaN      NaN           NaN      NaN
974   NaN      NaN           NaN      NaN
975   NaN      NaN           NaN      NaN
976   NaN      NaN           NaN      NaN
977   NaN      NaN           NaN      NaN
978   NaN      NaN           NaN      NaN
979   NaN      NaN           NaN      NaN
980   NaN      NaN           NaN      NaN
981   NaN      NaN           NaN      NaN
982   NaN      NaN           NaN      NaN
983   NaN      NaN           NaN      NaN
984   NaN      NaN           NaN      NaN
985   NaN      NaN           NaN      NaN
986   NaN      NaN           NaN      NaN
987   NaN      NaN           NaN      NaN
988   NaN      NaN           NaN      NaN
989   NaN      NaN           NaN      NaN
990   NaN      NaN           NaN      NaN
991   NaN      NaN           NaN      NaN
992   NaN      NaN           NaN      NaN
993   NaN      NaN           NaN      NaN
994   NaN      NaN           NaN      NaN
995   NaN      NaN           NaN      NaN
996   NaN      NaN           NaN      NaN
997   NaN      NaN           NaN      NaN
998   NaN      NaN           NaN      NaN
999   NaN      NaN           NaN      NaN

[1000 rows x 4 columns]

In [181]:
def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
    cost = sum_of_square_errors / (2.0*m)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    cost_history = []

    for i in range(40):
        new_theta = theta
        for j in xrange(0,len(theta)):
            new_theta[j] = theta[j] + (1.0*alpha/m) * (values - np.dot(features, theta)).dot(features[:,j])
        
        theta = new_theta
        
        cost_history.append(compute_cost(features, values, theta))
    return theta, pd.Series(cost_history)

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have a R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe. 
    We recommend that you don't use the EXITSn_hourly feature as an input to the 
    linear model because we cannot use it as a predictor: we cannot use exits 
    counts as a way to predict entry counts. 
    
    Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
    give you a random subet (~15%) of the data contained in 
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with 
    this computer on your own computer, locally. 
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''

    # Values
    values = dataframe[['ENTRIESn_hourly']]
    m = len(values)
    
    # Select Features (try different features!)
    features = dataframe[['rain', 'precipi', 'meanwindspdi', 'thunder']].copy()
    features, mu, sigma = normalize_features(features)

    # Add UNIT to features using dummy variables
    #dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    #features = features.join(dummy_units)
    
    features['ones'] = pd.Series(np.ones(m)) # Add a column of 1s (y intercept)

    #print features
    
    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values).flatten()
    #print "features", features_array
    #print "values", values_array

    # Set values for alpha, number of iterations.
    alpha = 0.01 # please feel free to change this value
    num_iterations = 75 # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array, 
                                                            values_array, 
                                                            theta_gradient_descent, 
                                                            alpha, 
                                                            num_iterations)
    
    #print "theta", theta_gradient_descent
    #print "cost", cost_history
    
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    #plot = plot_cost_history(alpha, cost_history)
    # 
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed 
    # the 30 second limit on the compute servers.
    
    #print features_array, theta_gradient_descent
    
    predictions = np.dot(features_array, theta_gradient_descent)
    #print predictions
    return predictions, plot


def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pd.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )

In [182]:
predictions, plot = predictions(dataframe)

values = dataframe[['ENTRIESn_hourly']]

r_squared = 1 - np.sum((np.square(values.values - predictions)))/np.sum(np.square(values.values - np.mean(values.values)))


sigma rain            0
precipi         0
meanwindspdi    0
thunder         0
dtype: float64

In [183]:
print r_squared


nan

finally started working, kinda (without dummy variables)

[ 81.71874535 17.32919409 129.93160132 -95.33262634 23.71990046
-21.84393317 14.18507349 -94.99585299 21.48823584 0.59198864]
Your r^2 value is 0.11123162895
The plot of your cost history is shown below.


In [186]:
import numpy as np
import pandas
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
    cost = sum_of_square_errors / (2.0*m)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    
    cost_history = []
    
    for iteration in xrange(0,num_iterations):

        new_theta = theta
        for j in xrange(0,len(theta)):
            #print j, features[:,j]
            new_theta[j] = theta[j] + ((1.0*alpha)/m) * (values - np.dot(features, theta)).dot(features[:,j])
        
        theta = new_theta
        
        cost_history.append(compute_cost(features, values, theta))
        
    return theta, pandas.Series(cost_history)

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have a R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe. 
    We recommend that you don't use the EXITSn_hourly feature as an input to the 
    linear model because we cannot use it as a predictor: we cannot use exits 
    counts as a way to predict entry counts. 
    
    Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
    give you a random subet (~15%) of the data contained in 
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with 
    this computer on your own computer, locally. 
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''

    # Select Features (try different features!)
    features = dataframe[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'rain', 'mintempi', 'maxtempi']].copy()
    
    # Add UNIT to features using dummy variables
    #dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit')
    #features = features.join(dummy_units)
    
    # Values
    values = dataframe[['ENTRIESn_hourly']]
    m = len(values)

    #features, mu, sigma = normalize_features(features)
    features['ones'] = np.ones(m) # Add a column of 1s (y intercept)
    print features.head
    
    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values).flatten()

    # Set values for alpha, number of iterations.
    alpha = 0.0003 # please feel free to change this value
    num_iterations = 1500 # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array, 
                                                            values_array, 
                                                            theta_gradient_descent, 
                                                            alpha, 
                                                            num_iterations)
    
    print theta_gradient_descent
    
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    plot = plot_cost_history(alpha, cost_history)
    # 
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed 
    # the 30 second limit on the compute servers.
    
    predictions = np.dot(features_array, theta_gradient_descent)
    return predictions, plot


def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pandas.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )

In [ ]: