In [212]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# load a 1000-row sample of the turnstile + weather data
dataframe = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=1000)

In [213]:
import numpy as np
import pandas as pd
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(X, y, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    m = len(y)
    
    cost = (1.0/(2.0*m)) * (((X.dot(theta)) - y).T).dot(X.dot(theta) - y)

    return cost

# delta-vector function for derivatives
def deltas(X, y, theta):
    m = len(y)

    # vectorized gradient of the cost: (1/m) * X^T (X*theta - y)
    delta = ((1.0/m) * (X.dot(theta) - y).T.dot(X)).T
    return delta

def gradient_descent(X, y, theta, alpha, iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """

    # gradient descent: repeatedly step against the gradient
    for iteration in xrange(0, iterations):
        theta = theta - (alpha * deltas(X, y, theta))

    return theta

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have an R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe.
    We recommend that you don't use the EXITSn_hourly feature as an input to the
    linear model because we cannot use it as a predictor: we cannot use exit
    counts as a way to predict entry counts.

    Note: Due to the memory and CPU limitations of our Amazon EC2 instance, we will
    give you a random subset (~15%) of the data contained in
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with
    the complete data set on your own computer, locally.
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''
    # initial data vectors
    x = dataframe[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'mintempi', 'maxtempi']]
    y = dataframe['ENTRIESn_hourly'].values

    # training sample size
    m = len(y)

    # proper vector shape
    y.shape = (m,1)

    # Add UNIT to features using dummy variables
    dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    x = x.join(dummy_units)
    x = x.values

    # number of features
    n = len(x[0,:])

    # design matrix
    X = np.hstack((np.ones((m,1)),x))

    # theta parameters 
    theta = np.zeros(((n+1),1))

    # gradient descent: number of iterations
    iterations = 500

    # candidate learning rates; a_index selects the rate actually used
    alpha = [-0.3, -0.1, -0.03, -0.01, -0.003, -0.001, 0, 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3]
    a_index = 7

    theta = gradient_descent(X, y, theta, alpha[a_index], iterations)
    
    plot = None
    # -------------------------------------------------
    # Note: this version does not record a cost history
    # (gradient_descent returns only theta), so the
    # plot_cost_history call from the template is omitted
    # here; plotting would also risk exceeding the
    # 30-second limit on the compute servers.
    # -------------------------------------------------
    
    predictions = np.dot(X, theta)
    return predictions, plot


def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pd.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )
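
For reference, the update that deltas() and gradient_descent() implement is the standard batch rule theta := theta - alpha * (1/m) * X^T (X*theta - y). Below is a minimal sketch of that same vectorized update on made-up toy data (not part of the turnstile set), converging to the obvious fit:

import numpy as np

# toy design matrix (intercept column plus one feature) and targets y = 1 + x
X_toy = np.array([[1.0, 1.0],
                  [1.0, 2.0],
                  [1.0, 3.0]])
y_toy = np.array([[2.0], [3.0], [4.0]])
theta_toy = np.zeros((2, 1))

m_toy = len(y_toy)
alpha_toy = 0.3
for _ in range(500):
    grad = (1.0 / m_toy) * X_toy.T.dot(X_toy.dot(theta_toy) - y_toy)
    theta_toy = theta_toy - alpha_toy * grad

print(theta_toy)   # close to [[1.], [1.]], i.e. intercept 1, slope 1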

In [214]:
predictions = predictions(dataframe)

Second attempt: the plot return value and the (unused) feature-normalization step are dropped from predictions().


In [215]:
import numpy as np
import pandas as pd
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""
def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(X, y, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    m = len(y)
    
    cost = (1.0/(2.0*m)) * (((X.dot(theta)) - y).T).dot(X.dot(theta) - y)

    return cost

def gradient_descent(X, y, theta, alpha, iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """

    # delta-vector function for derivatives
    def deltas(X, y, theta):
        m = len(y)

        # vectorized gradient of the cost: (1/m) * X^T (X*theta - y)
        delta = ((1.0/m) * (X.dot(theta) - y).T.dot(X)).T
        return delta

    # gradient descent: repeatedly step against the gradient
    for iteration in xrange(0, iterations):
        theta = theta - (alpha * deltas(X, y, theta))

    return theta

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have an R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe.
    We recommend that you don't use the EXITSn_hourly feature as an input to the
    linear model because we cannot use it as a predictor: we cannot use exit
    counts as a way to predict entry counts.

    Note: Due to the memory and CPU limitations of our Amazon EC2 instance, we will
    give you a random subset (~15%) of the data contained in
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with
    the complete data set on your own computer, locally.
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''
    # initial data vectors
    x = dataframe[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'mintempi', 'maxtempi']]
    y = dataframe['ENTRIESn_hourly'].values

    # training sample size
    m = len(y)

    # proper vector shape
    y.shape = (m,1)

    # Add UNIT to features using dummy variables
    dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    x = x.join(dummy_units)
    x = x.values

    # number of features
    n = len(x[0,:])

    # design matrix
    X = np.hstack((np.ones((m,1)),x))

    # theta parameters 
    theta = np.zeros(((n+1),1))

    # gradient descent: number of iterations
    iterations = 500

    # candidate learning rates; a_index selects the rate actually used
    alpha = [-0.3, -0.1, -0.03, -0.01, -0.003, -0.001, 0, 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3]
    a_index = 7

    theta = gradient_descent(X, y, theta, alpha[a_index], iterations)
    
    predictions = np.dot(X, theta)
    return predictions

def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pd.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )
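
Since UNIT is a categorical column, pd.get_dummies expands it into one indicator column per turnstile unit before it is joined onto the numeric features. A minimal sketch with made-up unit labels (the real column holds values like 'R001', 'R002', ...):

import pandas as pd

units = pd.Series(['R001', 'R002', 'R001', 'R003'])
dummies = pd.get_dummies(units, prefix='unit')
print(dummies.columns.tolist())   # ['unit_R001', 'unit_R002', 'unit_R003']
print(dummies)                    # each row has a 1 in its own unit's column, 0 elsewhere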

In [216]:
predictions = predictions(dataframe)

Third attempt: rewritten in the course template's terms (features, values, num_iterations), with a single fixed learning rate and an R^2 printout.


In [217]:
import numpy as np
import pandas as pd
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""
def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    m = len(values)
    
    cost = (1.0/(2.0*m)) * (((features.dot(theta)) - values).T).dot(features.dot(theta) - values)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """

    m = len(values)
    cost_history = []  # collected per iteration, but this version returns only theta

    # gradient descent: repeatedly step against the gradient
    for iteration in xrange(0, num_iterations):
        theta = theta - (alpha * (((1.0/m) * (features.dot(theta) - values).T.dot(features)).T))
        cost_history.append(compute_cost(features, values, theta))
 
    return theta

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have an R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe.
    We recommend that you don't use the EXITSn_hourly feature as an input to the
    linear model because we cannot use it as a predictor: we cannot use exit
    counts as a way to predict entry counts.

    Note: Due to the memory and CPU limitations of our Amazon EC2 instance, we will
    give you a random subset (~15%) of the data contained in
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with
    the complete data set on your own computer, locally.
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''
    # initial data vectors
    x = dataframe[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'mintempi', 'maxtempi']]
    values = dataframe['ENTRIESn_hourly'].values

    # training sample size
    m = len(values)

    # proper vector shape
    values.shape = (m,1)

    # Add UNIT to features using dummy variables
    dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    x = x.join(dummy_units)
    x = x.values

    # number of features
    n = len(x[0,:])

    # design matrix (the ones column is appended last in this version)
    features = np.hstack((x, np.ones((m,1))))
    print features    # quick sanity check: echoes the design matrix
    # theta parameters 
    theta = np.zeros(((n+1),1))

    # gradient descent: number of iterations
    iterations = 400

    # learning rate (small, because the features are not normalized here)
    alpha = 0.0001

    theta = gradient_descent(features, values, theta, alpha, iterations)

    # calculate r^2 (after gradient descent) 
    r_squared = 1 - np.sum((np.square(values - np.dot(features, theta))))/np.sum(np.square(values - np.mean(values)))
    print r_squared
    
    plot = None
    
    predictions = np.dot(features, theta)
    return predictions, plot

def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pd.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )
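
The r_squared line above applies R^2 = 1 - SS_res / SS_tot directly in numpy. A small sketch of the same computation on made-up numbers, with an optional cross-check against scikit-learn (scikit-learn is not used elsewhere in this notebook; assuming it is installed locally):

import numpy as np

def r_squared(values, predicted):
    # R^2 = 1 - SS_res / SS_tot
    ss_res = np.sum(np.square(values - predicted))
    ss_tot = np.sum(np.square(values - np.mean(values)))
    return 1.0 - ss_res / ss_tot

# made-up numbers, purely for illustration
y_true = np.array([3.0, 5.0, 7.0, 9.0])
y_pred = np.array([2.5, 5.5, 7.0, 8.5])
print(r_squared(y_true, y_pred))

# optional cross-check, if scikit-learn is available:
# from sklearn.metrics import r2_score
# print(r2_score(y_true, y_pred))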

In [218]:
predictions = predictions(dataframe)
predictions


[[  1.    30.31  42.   ...,   0.     0.     1.  ]
 [  5.    30.31  42.   ...,   0.     0.     1.  ]
 [  9.    30.31  42.   ...,   0.     0.     1.  ]
 ..., 
 [  1.    30.31  42.   ...,   0.     1.     1.  ]
 [  5.    30.31  42.   ...,   0.     1.     1.  ]
 [  9.    30.31  42.   ...,   0.     1.     1.  ]]
0.202824343075
Out[218]:
(array([[  187.97876277],
        [  621.64087253],
        [ 1055.30298228],
        ..., 
        [  187.75842003],
        [  621.42052979],
        [ 1055.08263954]]), None)

The version the grader finally accepted: features are normalized first, so a much larger learning rate (0.1) and only 75 iterations suffice.


In [219]:
import numpy as np
import pandas
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    
    cost = (1.0/(2.0*m)) * (((features.dot(theta)) - values).T).dot(features.dot(theta) - values)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    cost_history = []
    
    for iteration in xrange(0,num_iterations):
        theta = theta - (alpha * (((1.0/m) * (features.dot(theta) - values).T.dot(features)).T))
        cost_history.append(compute_cost(features, values, theta))
    
    return theta, pandas.Series(cost_history)

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have an R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe.
    We recommend that you don't use the EXITSn_hourly feature as an input to the
    linear model because we cannot use it as a predictor: we cannot use exit
    counts as a way to predict entry counts.

    Note: Due to the memory and CPU limitations of our Amazon EC2 instance, we will
    give you a random subset (~15%) of the data contained in
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with
    the complete data set on your own computer, locally.
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''
    # Select Features (try different features!)
    features = dataframe[['rain', 'precipi', 'Hour', 'meantempi']]
    
    # Add UNIT to features using dummy variables
    dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit')
    features = features.join(dummy_units)
    
    # Values
    values = dataframe[['ENTRIESn_hourly']]
    m = len(values)

    features, mu, sigma = normalize_features(features)
    features['ones'] = np.ones(m) # Add a column of 1s (y intercept)
    
    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values).flatten()

    # Set values for alpha, number of iterations.
    alpha = 0.1 # please feel free to change this value
    num_iterations = 75 # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array, 
                                                            values_array, 
                                                            theta_gradient_descent, 
                                                            alpha, 
                                                            num_iterations)
    
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    # plot = plot_cost_history(alpha, cost_history)
    # 
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed 
    # the 30 second limit on the compute servers.

    # calculate r^2 (after gradient descent) 
    r_squared = 1 - np.sum((np.square(values_array - np.dot(features_array, theta_gradient_descent))))/np.sum(np.square(values_array - np.mean(values_array)))
    print r_squared
    
    predictions = np.dot(features_array, theta_gradient_descent)
    return predictions, plot


def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pandas.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )
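
The decisive change relative to the earlier attempts is the call to normalize_features(): once every column has mean 0 and standard deviation 1, a single, much larger learning rate (0.1) converges within 75 iterations, whereas the unnormalized versions needed alpha = 0.0001 and hundreds of iterations. A minimal sketch of the rescaling on a made-up column:

import numpy as np
import pandas as pd

# made-up pressure-like column with a narrow range around 30
col = pd.Series([29.8, 30.1, 30.3, 29.9, 30.2])
normalized = (col - col.mean()) / col.std()
print(normalized.mean())   # approximately 0 (up to floating point)
print(normalized.std())    # exactly 1 by construction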