In [ ]:

    
import numpy as np
import scipy.stats as stats
import pandas as pd

def compare_averages(filename):
    """
    Performs a t-test on two sets of baseball data (left-handed and right-handed hitters).

    You will be given a csv file that has three columns.  A player's
    name, handedness (L for lefthanded or R for righthanded) and their
    career batting average (called 'avg'). You can look at the csv
    file via the following link:
    https://www.dropbox.com/s/xcn0u2uxm8c4n6l/baseball_data.csv
    
    Write a function that will read that the csv file into a pandas data frame,
    and run Welch's t-test on the two cohorts defined by handedness.
    
    One cohort should be a data frame of right-handed batters. And the other
    cohort should be a data frame of left-handed batters.
    
    We have included the scipy.stats library to help you write
    or implement Welch's t-test:
    http://docs.scipy.org/doc/scipy/reference/stats.html
    
    With a significance level of 95%, if there is no difference
    between the two cohorts, return a tuple consisting of
    True, and then the tuple returned by scipy.stats.ttest.  
    
    If there is a difference, return a tuple consisting of
    False, and then the tuple returned by scipy.stats.ttest.
    
    For example, the tuple that you return may look like:
    (True, (9.93570222, 0.000023))
    """
    df = pd.read_csv(filename)
    rh = df[df['handedness'] == 'R']
    lh = df[df['handedness'] == 'L']
    
    return (False, stats.ttest_ind(rh['avg'],lh['avg'],equal_var=False))



In [ ]:

    
import numpy as np
import pandas as pd

def compute_cost(features, values, theta):
    """
    Compute the cost of a list of parameters, theta, given a list of features 
    (input data points) and values (output data points).
    """
    m = len(values)
    sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
    cost = sum_of_square_errors / (2*m)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    """

    # Write code here that performs num_iterations updates to the elements of theta.
    # times. Every time you compute the cost for a given list of thetas, append it 
    # to cost_history.
    # See the Instructor notes for hints. 
    
    m = len(values)
    
    cost_history = []
    
    for iteration in xrange(0,num_iterations):

        new_theta = theta
        for j in xrange(0,len(theta)):
            new_theta[j] = theta[j] + (1.0*alpha/m) * (values - np.dot(features, theta)).dot(features[:,j])
        
        theta = new_theta
        
        cost_history.append(compute_cost(features, values, theta))

    return theta, pd.Series(cost_history) # leave this line for the grader



In [ ]:

    
import numpy as np

def compute_r_squared(data, predictions):
    # Write a function that, given two input numpy arrays, 'data', and 'predictions,'
    # returns the coefficient of determination, R^2, for the model that produced 
    # predictions.
    # 
    # Numpy has a couple of functions -- np.mean() and np.sum() --
    # that you might find useful, but you don't have to use them.

    r_squared = 1 - np.sum((np.square(data - predictions)))/np.sum(np.square(data - np.mean(data)))

    return r_squared



In [ ]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def entries_histogram(turnstile_weather):
    '''
    Before we perform any analysis, it might be useful to take a
    look at the data we're hoping to analyze. More specifically, let's 
    examine the hourly entries in our NYC subway data and determine what
    distribution the data follows. This data is stored in a dataframe
    called turnstile_weather under the ['ENTRIESn_hourly'] column.
    
    Let's plot two histograms on the same axes to show hourly
    entries when raining vs. when not raining. Here's an example on how
    to plot histograms with pandas and matplotlib:
    turnstile_weather['column_to_graph'].hist()
    
    Your histograph may look similar to bar graph in the instructor notes below.
    
    You can read a bit about using matplotlib and pandas to plot histograms here:
    http://pandas.pydata.org/pandas-docs/stable/visualization.html#histograms
    
    You can see the information contained within the turnstile weather data here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    '''


    norain_entries = turnstile_weather[turnstile_weather['rain'] == 0]
    rain_entries = turnstile_weather[turnstile_weather['rain'] == 1]
    
    norain_entries['ENTRIESn_hourly'].hist(bins=24, range=(0,6000))
    rain_entries['ENTRIESn_hourly'].hist(bins=20, range=(0,6000), alpha=0.8)
    
    plt.title('Histogram of ENTRIESn_hourly')
    plt.xlabel('ENTRIESn_hourly')
    plt.ylabel('Frequency')
    plt.legend(["No Rain", "Rain"])
    
    
    return plt



In [ ]:

    
import numpy as np
import scipy
import scipy.stats
import pandas

def mann_whitney_plus_means(turnstile_weather):
    '''
    This function will consume the turnstile_weather dataframe containing
    our final turnstile weather data. 
    
    You will want to take the means and run the Mann Whitney U-test on the 
    ENTRIESn_hourly column in the turnstile_weather dataframe.
    
    This function should return:
        1) the mean of entries with rain
        2) the mean of entries without rain
        3) the Mann-Whitney U-statistic and p-value comparing the number of entries
           with rain and the number of entries without rain
    
    You should feel free to use scipy's Mann-Whitney implementation, and you 
    might also find it useful to use numpy's mean function.
    
    Here are the functions' documentation:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html
    
    You can look at the final turnstile weather data at the link below:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    '''
    
    without_rain_mean = turnstile_weather[turnstile_weather['rain'] == 0]['ENTRIESn_hourly'].mean()
    with_rain_mean = turnstile_weather[turnstile_weather['rain'] == 1]['ENTRIESn_hourly'].mean()
    
    U,p = scipy.stats.mannwhitneyu(turnstile_weather[turnstile_weather['rain'] == 0]['ENTRIESn_hourly'], turnstile_weather[turnstile_weather['rain'] == 1]['ENTRIESn_hourly'])
    
    return with_rain_mean, without_rain_mean, U, p # leave this line for the grader

Per the instructions, I calculated the mean of entries with rain, the mean of entries without rain, and the Mann-Whitney U-statistic and p-value comparing the number of entries with rain and the number of entries without rain. As the U value is extremely high, and the P value is quite low (~0.025), it seems reasonable at this time to reject the null hypothesis that the distributions of both rainy days and non-rainy days are identical.

3.5



In [53]:

    
import numpy as np
import pandas as pd
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.
"""
def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

dataframe = pd.read_csv('turnstile_data_master_with_weather.csv')
#dataframe = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=1000) ##### UNCOMMENT FOR SUBMISSION
dataframe.head()
dataframe.describe()









    Out[53]:






  
    
      
      Unnamed: 0
      Hour
      ENTRIESn_hourly
      EXITSn_hourly
      maxpressurei
      maxdewpti
      mindewpti
      minpressurei
      meandewpti
      meanpressurei
      fog
      rain
      meanwindspdi
      mintempi
      meantempi
      maxtempi
      precipi
      thunder
    
  
  
    
      count
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951.000000
       131951
    
    
      mean
        65975.000000
           10.896158
         1095.348478
          886.890838
           30.031894
           57.241302
           48.259013
           29.892714
           52.703526
           29.965077
            0.167100
            0.334245
            5.543065
           56.169775
           64.269729
           71.769968
            0.172276
            0
    
    
      std
        38091.117022
            6.892084
         2337.015421
         2008.604886
            0.125689
            8.770891
           11.305312
            0.146384
            9.943590
            0.130461
            0.373066
            0.471728
            1.982441
            6.338875
            6.568289
            7.627218
            0.429005
            0
    
    
      min
            0.000000
            0.000000
            0.000000
            0.000000
           29.740000
           39.000000
           22.000000
           29.540000
           31.000000
           29.640000
            0.000000
            0.000000
            1.000000
           46.000000
           55.000000
           58.000000
            0.000000
            0
    
    
      25%
        32987.500000
            5.000000
           39.000000
           32.000000
           29.960000
           50.000000
           38.000000
           29.840000
           45.000000
           29.910000
            0.000000
            0.000000
            5.000000
           52.000000
           60.000000
           65.000000
            0.000000
            0
    
    
      50%
        65975.000000
           12.000000
          279.000000
          232.000000
           30.030000
           57.000000
           51.000000
           29.910000
           54.000000
           29.960000
            0.000000
            0.000000
            5.000000
           54.000000
           63.000000
           71.000000
            0.000000
            0
    
    
      75%
        98962.500000
           17.000000
         1109.000000
          847.000000
           30.100000
           64.000000
           55.000000
           29.970000
           60.000000
           30.050000
            0.000000
            1.000000
            6.000000
           60.000000
           68.000000
           78.000000
            0.100000
            0
    
    
      max
       131950.000000
           23.000000
        51839.000000
        45249.000000
           30.310000
           70.000000
           66.000000
           30.230000
           68.000000
           30.270000
            1.000000
            1.000000
           12.000000
           70.000000
           78.000000
           86.000000
            2.180000
            0



In [26]:

    
features = dataframe[['rain', 'precipi', 'Hour', 'meantempi']]
#features = dataframe[['rain', 'precipi', 'meanwindspdi', 'thunder']]



In [27]:

    
# Add UNIT to features using dummy variables
#dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
#features = features.join(dummy_units)
   
# Values
values = dataframe[['ENTRIESn_hourly']]
m = len(values)

features, mu, sigma = normalize_features(features)
features['ones'] = np.ones(m) # Add a column of 1s (y intercept)

# Convert features and values to numpy arrays
features_array = np.array(features)
values_array = np.array(values)



In [34]:

    
# Set values for alpha, number of iterations.
alpha = 0.1 # please feel free to change this value
num_iterations = 75 # please feel free to change this value

# Initialize theta, perform gradient descent
theta_gradient_descent = np.zeros((len(features.columns),1))



In [37]:

    
def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    m = len(values)
    cost = (1.0/(2*m)) * (((features.dot(theta)) - values).T).dot(features.dot(theta) - values)
    
    return cost

#cost = compute_cost(features_array, values_array, theta_gradient_descent)



In [44]:

    
# delta-vector function for derivatives
def deltas(features, values, theta):
    m = len(values)
    n = len(features[0])

    delta = np.zeros((n,1))
    for j in xrange(0,n):
        summation = 0
        for i in xrange(0,m):
            summation += ((theta.T.dot(features[i]) - values[i]) * features[i][j])
                
        delta[j] = (1.0/m) * summation
    
    return delta

#deltas(features_array, values_array, theta_gradient_descent)



In [1]:

    
def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """
    m = len(values)
    cost_history = []

    for iteration in xrange(0,num_iterations):
        theta = theta - (alpha * deltas(features, values, theta))

        cost_history.append(compute_cost(features, values, theta))
    
    return theta, pd.Series(cost_history)

theta_gradient_descent, cost_history = gradient_descent(features_array, values_array, theta_gradient_descent, alpha, num_iterations)
#theta_gradient_descent = gradient_descent(features_array, values_array, theta_gradient_descent, alpha, num_iterations)
#theta_gradient_descent









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-1c831abdc65b> in <module>()
     16     return theta, pd.Series(cost_history)
     17 
---> 18 theta_gradient_descent, cost_history = gradient_descent(features_array, values_array, theta_gradient_descent, alpha, num_iterations)
     19 #theta_gradient_descent = gradient_descent(features_array, values_array, theta_gradient_descent, alpha, num_iterations)
     20 #theta_gradient_descent

NameError: name 'features_array' is not defined



In [2]:

    
def predictions():
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have a R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe. 
    We recommend that you don't use the EXITSn_hourly feature as an input to the 
    linear model because we cannot use it as a predictor: we cannot use exits 
    counts as a way to predict entry counts. 
    
    Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
    give you a random subet (~15%) of the data contained in 
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with 
    this computer on your own computer, locally. 
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''
    
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    plot = plot_cost_history(alpha, cost_history)
    # 
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed 
    # the 30 second limit on the compute servers.
    
    predictions = np.dot(features_array, theta_gradient_descent)
    return predictions, plot

#predictions()



In [20]:

    
def plot_cost_history(alpha, cost_history):
    """This function is for viewing the plot of your cost history. You can run it by uncommenting this plot_cost_history(alpha, cost_history) call in predictions. If you want to run this locally, you should print the return value from this function."""
    
    cost_df = pd.DataFrame({'Cost_History': cost_history, 'Iteration': range(len(cost_history))})
    
    return_value = ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )

    return return_value

plot_cost_history(alpha, cost_history)









    Out[20]:





<ggplot: (282030649)>

using code submitted for exercises



In [4]:

    
import pandas as pd
dataframe = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=1000)



In [51]:

    
import numpy as np
import pandas as pd
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def normalize_features(array):
   """
   Normalize the features in the data set.
   """
   array_normalized = (array-array.mean())/array.std()
   mu = array.mean()
   sigma = array.std()

   return array_normalized, mu, sigma

def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    
    This can be the same code as the compute_cost function in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
    cost = sum_of_square_errors / (2*m)

    return cost

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    
    This can be the same gradient descent code as in the lesson #3 exercises,
    but feel free to implement your own.
    """
    
    m = len(values)
    cost_history = []

    for i in range(num_iterations):
        new_theta = theta
        for j in xrange(0,len(theta)):
            new_theta[j] = theta[j] + (1.0*alpha/m) * (values - np.dot(features, theta)).dot(features[:,j])
        
        theta = new_theta
        
        cost_history.append(compute_cost(features, values, theta))
    return theta, pd.Series(cost_history)

def predictions(dataframe):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
    
    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv    
    
    Your prediction should have a R^2 value of 0.20 or better.
    You need to experiment using various input features contained in the dataframe. 
    We recommend that you don't use the EXITSn_hourly feature as an input to the 
    linear model because we cannot use it as a predictor: we cannot use exits 
    counts as a way to predict entry counts. 
    
    Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will
    give you a random subet (~15%) of the data contained in 
    turnstile_data_master_with_weather.csv. You are encouraged to experiment with 
    this computer on your own computer, locally. 
    
    
    If you'd like to view a plot of your cost history, uncomment the call to 
    plot_cost_history below. The slowdown from plotting is significant, so if you 
    are timing out, the first thing to do is to comment out the plot command again.
    
    If you receive a "server has encountered an error" message, that means you are 
    hitting the 30-second limit that's placed on running your program. Try using a 
    smaller number for num_iterations if that's the case.
    
    If you are using your own algorithm/models, see if you can optimize your code so 
    that it runs faster.
    '''
    # Select Features (try different features!)
    features = dataframe[['rain', 'precipi', 'Hour', 'meantempi']]
    
    # Add UNIT to features using dummy variables
    #dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    #features = features.join(dummy_units)
    
    # Values
    values = dataframe[['ENTRIESn_hourly']]
    m = len(values)

    #features, mu, sigma = normalize_features(features)
    features['ones'] = pd.Series(np.ones(m)) # Add a column of 1s (y intercept)
    
    print features
    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values).flatten()
    print "features", features_array
    print "values", values_array

    # Set values for alpha, number of iterations.
    alpha = 0.1 # please feel free to change this value
    num_iterations = 75 # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array, 
                                                            values_array, 
                                                            theta_gradient_descent, 
                                                            alpha, 
                                                            num_iterations)
    
    #print "theta", theta_gradient_descent
    #print "cost", cost_history
    
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    #plot = plot_cost_history(alpha, cost_history)
    # 
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed 
    # the 30 second limit on the compute servers.
    
    predictions = np.dot(features_array, theta_gradient_descent)
    return predictions, plot


def plot_cost_history(alpha, cost_history):
   """This function is for viewing the plot of your cost history.
   You can run it by uncommenting this

       plot_cost_history(alpha, cost_history) 

   call in predictions.
   
   If you want to run this locally, you should print the return value
   from this function.
   """
   cost_df = pd.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
      geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha )



In [52]:

    
predictions(dataframe)









    



      rain  precipi  Hour  meantempi  ones
0        0        0     1         60     1
1        0        0     5         60     1
2        0        0     9         60     1
3        0        0    13         60     1
4        0        0    17         60     1
5        0        0    21         60     1
6        0        0     1         60     1
7        0        0     5         60     1
8        0        0     9         60     1
9        0        0    13         60     1
10       0        0    17         60     1
11       0        0    21         60     1
12       0        0     0         60     1
13       0        0     4         60     1
14       0        0    12         60     1
15       0        0    16         60     1
16       0        0    20         60     1
17       0        0     0         60     1
18       0        0     4         60     1
19       0        0     8         60     1
20       0        0    12         60     1
21       0        0    16         60     1
22       0        0    20         60     1
23       0        0     0         60     1
24       0        0     4         60     1
25       0        0    12         60     1
26       0        0    16         60     1
27       0        0    20         60     1
28       0        0     0         60     1
29       0        0     4         60     1
...    ...      ...   ...        ...   ...
971      0        0    17         60     1
972      0        0    21         60     1
973      0        0     1         60     1
974      0        0     5         60     1
975      0        0     9         60     1
976      0        0    13         60     1
977      0        0    17         60     1
978      0        0    21         60     1
979      0        0     1         60     1
980      0        0     5         60     1
981      0        0     9         60     1
982      0        0    13         60     1
983      0        0    17         60     1
984      0        0    21         60     1
985      0        0     1         60     1
986      0        0     5         60     1
987      0        0     9         60     1
988      0        0    13         60     1
989      0        0    17         60     1
990      0        0    21         60     1
991      0        0     0         60     1
992      0        0     4         60     1
993      0        0     8         60     1
994      0        0    12         60     1
995      0        0    16         60     1
996      0        0    20         60     1
997      0        0     1         60     1
998      0        0     5         60     1
999      0        0     9         60     1
ones   NaN      NaN   NaN        NaN   NaN

[1001 rows x 5 columns]
features [[  0.   0.   1.  60.   1.]
 [  0.   0.   5.  60.   1.]
 [  0.   0.   9.  60.   1.]
 ..., 
 [  0.   0.   5.  60.   1.]
 [  0.   0.   9.  60.   1.]
 [ nan  nan  nan  nan  nan]]
values [   0.  217.  890. ...,  118.  669.   nan]






    Out[52]:





(array([ nan,  nan,  nan, ...,  nan,  nan,  nan]), None)



In [ ]:

    
import numpy as np
import scipy
import matplotlib.pyplot as plt

def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and the predicted values).

    Based on this residual histogram, do you have any insight into how our model
    performed?  Reading a bit on this webpage might be useful:

    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''
    
    (turnstile_weather['ENTRIESn_hourly'] - predictions).hist()
    return plt



In [ ]:

    
import numpy as np
import scipy
import matplotlib.pyplot as plt
import sys

def compute_r_squared(data, predictions):
    '''
    In exercise 5, we calculated the R^2 value for you. But why don't you try and
    and calculate the R^2 value yourself.
    
    Given a list of original data points, and also a list of predicted data points,
    write a function that will compute and return the coefficient of determination (R^2)
    for this data.  numpy.mean() and numpy.sum() might both be useful here, but
    not necessary.

    Documentation about numpy.mean() and numpy.sum() below:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html
    '''
    
    r_squared = 1 - np.sum((np.square(data - predictions)))/np.sum(np.square(data - np.mean(data)))
    
    return r_squared



In [3]:

    
import pandas as pd
weather_turnstile = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=1000)



In [39]:

    
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm

"""
In this optional exercise, you should complete the function called 
predictions(turnstile_weather). This function takes in our pandas 
turnstile weather dataframe, and returns a set of predicted ridership values,
based on the other information in the dataframe.  

In exercise 3.5 we used Gradient Descent in order to compute the coefficients
theta used for the ridership prediction. Here you should attempt to implement 
another way of computing the coeffcients theta. You may also try using a reference implementation such as: 
http://statsmodels.sourceforge.net/devel/generated/statsmodels.regression.linear_model.OLS.html

One of the advantages of the statsmodels implementation is that it gives you
easy access to the values of the coefficients theta. This can help you infer relationships 
between variables in the dataset.

You may also experiment with polynomial terms as part of the input variables.  

The following links might be useful: 
http://en.wikipedia.org/wiki/Ordinary_least_squares
http://en.wikipedia.org/w/index.php?title=Linear_least_squares_(mathematics)
http://en.wikipedia.org/wiki/Polynomial_regression

This is your playground. Go wild!

How does your choice of linear regression compare to linear regression
with gradient descent computed in Exercise 3.5?

You can look at the information contained in the turnstile_weather dataframe below:
https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

Note: due to the memory and CPU limitation of our amazon EC2 instance, we will
give you a random subset (~10%) of the data contained in turnstile_data_master_with_weather.csv

If you receive a "server has encountered an error" message, that means you are hitting 
the 30 second limit that's placed on running your program. See if you can optimize your code so it
runs faster.
"""

def predictions(weather_turnstile):
    # initial data vectors
    x = weather_turnstile[['Hour', 'maxpressurei', 'maxdewpti', 'mindewpti', 'meandewpti', 'meanpressurei', 'mintempi', 'maxtempi']]
    y = weather_turnstile['ENTRIESn_hourly'].values

    # training sample size
    m = len(y)

    # proper vector shape
    y.shape = (m,1)

    # Add UNIT to features using dummy variables
    dummy_units = pd.get_dummies(weather_turnstile['UNIT'], prefix='unit')
    x = x.join(dummy_units)
    x = x.values

    # number of features
    n = len(x[0,:])

    # design matrix
    X = np.hstack((np.ones((m,1)),x))
    
    model = sm.OLS(y,X)
    results = model.fit()
    prediction = model.predict(results.params)
    
    return prediction



In [40]:

    
prediction = predictions(weather_turnstile)

	Unnamed: 0	Hour	ENTRIESn_hourly	EXITSn_hourly	maxpressurei	maxdewpti	mindewpti	minpressurei	meandewpti	meanpressurei	fog	rain	meanwindspdi	mintempi	meantempi	maxtempi	precipi	thunder
count	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951.000000	131951
mean	65975.000000	10.896158	1095.348478	886.890838	30.031894	57.241302	48.259013	29.892714	52.703526	29.965077	0.167100	0.334245	5.543065	56.169775	64.269729	71.769968	0.172276	0
std	38091.117022	6.892084	2337.015421	2008.604886	0.125689	8.770891	11.305312	0.146384	9.943590	0.130461	0.373066	0.471728	1.982441	6.338875	6.568289	7.627218	0.429005	0
min	0.000000	0.000000	0.000000	0.000000	29.740000	39.000000	22.000000	29.540000	31.000000	29.640000	0.000000	0.000000	1.000000	46.000000	55.000000	58.000000	0.000000	0
25%	32987.500000	5.000000	39.000000	32.000000	29.960000	50.000000	38.000000	29.840000	45.000000	29.910000	0.000000	0.000000	5.000000	52.000000	60.000000	65.000000	0.000000	0
50%	65975.000000	12.000000	279.000000	232.000000	30.030000	57.000000	51.000000	29.910000	54.000000	29.960000	0.000000	0.000000	5.000000	54.000000	63.000000	71.000000	0.000000	0
75%	98962.500000	17.000000	1109.000000	847.000000	30.100000	64.000000	55.000000	29.970000	60.000000	30.050000	0.000000	1.000000	6.000000	60.000000	68.000000	78.000000	0.100000	0
max	131950.000000	23.000000	51839.000000	45249.000000	30.310000	70.000000	66.000000	30.230000	68.000000	30.270000	1.000000	1.000000	12.000000	70.000000	78.000000	86.000000	2.180000	0