In [1]:
import pandas as pd

dataframe = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=100)

In [2]:
import numpy as np
import scipy
import pandas
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
    cost = sum_of_square_errors / (2.0*m)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    
    cost_history = []
    
    for iteration in xrange(0,num_iterations):

        new_theta = theta
        for j in xrange(0,len(theta)):
            #print j, features[:,j]
            new_theta[j] = theta[j] + ((1.0*alpha)/m) * ((values - np.dot(features, theta)).dot(features[:,j]))
        
        theta = new_theta
        
        cost_history.append(compute_cost(features, values, theta))
        
    return theta, pandas.Series(cost_history)

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have a R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe. 
    We recommend that you don't use the EXITSn_hourly feature as an input to the 
    linear model because we cannot use it as a predictor: we cannot use exits 
    counts as a way to predict entry counts. 
    
    Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
    give you a random subet (~15%) of the data contained in 
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with 
    this computer on your own computer, locally. 
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''

    # Select Features (try different features!)
    #features = dataframe[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'mintempi', 'maxtempi']].copy()
    features = dataframe[['Hour', 'maxpressurei', 'maxtempi']].copy()
    
    # Add UNIT to features using dummy variables
    dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit')
    features = features.join(dummy_units)
    
    # Values
    values = dataframe[['ENTRIESn_hourly']]
    m = len(values)

    features, mu, sigma = normalize_features(features)
    features['ones'] = np.ones(m) # Add a column of 1s (y intercept)
    print features.head()
    
    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values).flatten()

    # Set values for alpha, number of iterations.
    alpha = 0.1 # please feel free to change this value
    num_iterations = 100 # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array, 
                                                            values_array, 
                                                            theta_gradient_descent, 
                                                            alpha, 
                                                            num_iterations)
    
    print theta_gradient_descent
    
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    plot = plot_cost_history(alpha, cost_history)
    # 
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed 
    # the 30 second limit on the compute servers.
    
    
    predictions = np.dot(features_array, theta_gradient_descent)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(features, values)
    print str(1 - np.sum((np.square(values.values - predictions)))/np.sum(np.square(values.values - np.mean(values.values))))
    print r_value**2
    return predictions, plot


def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pandas.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )

In [3]:
predictions(dataframe)


       Hour  maxpressurei  maxtempi  unit_R001  unit_R002  unit_R003  \
0 -1.306898  2.269830e-08       NaN   3.938274  -0.251379  -0.228266   
1 -0.738681  2.269830e-08       NaN   3.938274  -0.251379  -0.228266   
2 -0.170465  2.269830e-08       NaN   3.938274  -0.251379  -0.228266   
3  0.397752  2.269830e-08       NaN   3.938274  -0.251379  -0.228266   
4  0.965968  2.269830e-08       NaN   3.938274  -0.251379  -0.228266   

   unit_R004  unit_R005  unit_R006  unit_R007    ...      unit_R010  \
0  -0.251379  -0.228266  -0.251379  -0.203101    ...      -0.251379   
1  -0.251379  -0.228266  -0.251379  -0.203101    ...      -0.251379   
2  -0.251379  -0.228266  -0.251379  -0.203101    ...      -0.251379   
3  -0.251379  -0.228266  -0.251379  -0.203101    ...      -0.251379   
4  -0.251379  -0.228266  -0.251379  -0.203101    ...      -0.251379   

   unit_R011  unit_R012  unit_R013  unit_R014  unit_R015  unit_R016  \
0  -0.251379  -0.272976  -0.251379  -0.251379  -0.251379  -0.228266   
1  -0.251379  -0.272976  -0.251379  -0.251379  -0.251379  -0.228266   
2  -0.251379  -0.272976  -0.251379  -0.251379  -0.251379  -0.228266   
3  -0.251379  -0.272976  -0.251379  -0.251379  -0.251379  -0.228266   
4  -0.251379  -0.272976  -0.251379  -0.251379  -0.251379  -0.228266   

   unit_R017  unit_R018  ones  
0  -0.251379  -0.203101     1  
1  -0.251379  -0.203101     1  
2  -0.251379  -0.203101     1  
3  -0.251379  -0.203101     1  
4  -0.251379  -0.203101     1  

[5 rows x 22 columns]
[ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-770d36682877> in <module>()
----> 1 predictions(dataframe)

<ipython-input-2-22464cd5528b> in predictions(dataframe)
    140 
    141     predictions = np.dot(features_array, theta_gradient_descent)
--> 142     slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(features, values)
    143     print str(1 - np.sum((np.square(values.values - predictions)))/np.sum(np.square(values.values - np.mean(values.values))))
    144     print r_value**2

/usr/local/lib/python2.7/site-packages/scipy/stats/stats.pyc in linregress(x, y)
   3009 
   3010     # average sum of squares:
-> 3011     ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
   3012     r_num = ssxym
   3013     r_den = np.sqrt(ssxm*ssym)

/usr/local/lib/python2.7/site-packages/numpy/lib/function_base.pyc in cov(m, y, rowvar, bias, ddof)
   1881     if y is not None:
   1882         y = array(y, copy=False, ndmin=2, dtype=dtype)
-> 1883         X = concatenate((X, y), axis)
   1884 
   1885     X -= X.mean(axis=1-axis, keepdims=True)

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [ ]: