lin_reg.ipynb

An ipython notebook cataloging the process we went through to create a ridge regression model for hubway ridership.

Final working code


In [1]:
"""
Import necessary modules
"""

import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

from pprint import pprint

print "modules imported"


modules imported

In [ ]:
#load weather data
from weather_collection import *
weather = pickle.load(open('LargeDataStorage/weatherDataFile', 'rb'))

#load hubway data
from hubway_collection import *
hubway = pickle.load(open('LargeDataStorage/hubwayDataFile', 'rb'))

print "weather and hubway data loaded"

In [ ]:
def count_riders2(year, month, day, hour):
    """
    Input: year, month, day, hour
    Output: total riders during that hour
    """

    #initialize counter
    counter = 0

    #counts riders during a given hour
    for minute in range(0,60):
        #-1 means that there is no data for that time, so we don't count that
        if hubway.data[year][month][day][hour][minute] == -1:
            pass
        else:
            counter += len(hubway.data[year][month][day][hour][minute])
    return counter

In [ ]:
def process_data2():
    """
    Warning: hard-coded for hubway data from 2013
    Output: Array formatted array([year, month, day, hour, temp, precip, snow*, riders])
    Note: * data is binary, units are in imperial (english) units
    """
    
    year = 2013
    
    # not a leap year, also taking into account dates hubway was open
    # 2013 start = 4/2/2013
    # 2013 end = 11/30/2013
    numDaysInMonth = [29, 31, 30, 31, 31, 30, 31, 30]
    
    # initalize main list for data
    all_data = []
    
    for index in range(sum(numDaysInMonth)):
        # initalize list that will be appended to all_data
        curr_list = [year]

        for month in range(4, 6):
            for day in range(numDaysInMonth[month-4]):
                for hour in range(0,24):
                    # this is here to make sure that data for April starts on the 2nd
                    if month == 4:
                        tempi = int(float(weather.data[year][month][day+2][hour]['tempi']))
                        if int(float(weather.data[year][month][day+2][hour]['precipi'])) < 0:
                            precipi = 0
                        else: 
                            precipi = int(float(weather.data[year][month][day+2][hour]['precipi']))
                        snow = int(weather.data[year][month][day+2][hour]['snow'])
                        riders = count_riders2(year, month, day+2, hour)
                        curr_list = [year, month, day+2, hour, tempi, precipi, snow, riders]
                        all_data.append(curr_list)
                    else:
                        tempi = int(float(weather.data[year][month][day+1][hour]['tempi']))
                        if int(float(weather.data[year][month][day+1][hour]['precipi'])) < 0:
                            precipi = 0
                        else:
                            precipi = int(float(weather.data[year][month][day+1][hour]['precipi']))
                        snow = int(weather.data[year][month][day+1][hour]['snow'])
                        riders = count_riders2(year, month, day+1, hour)
                        curr_list = [year, month, day+1, hour, tempi, precipi, snow, riders]
                        all_data.append(curr_list)
    
    return np.array(all_data)

In [ ]:
data_array = process_data2()

In [ ]:
"""
Running a linear regression after transforming a polynomial 
into multiple coefficients so that a linear regression will work
"""

def lin_reg():
    
    year = 2013
    
    X = data_array[:,[1,2,3,4,5,6]]
    Y = data_array[:,7]

    # make array vertical so that scikit-learn can process it
    X = X.reshape(X.shape[0], -1)
    Y = Y.reshape(Y.shape[0], -1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    
    for degrees in range(6, 11):
        model = make_pipeline(PolynomialFeatures(degrees), Ridge())

        model.fit(X_train, y_train)
        print "Year %d, %d degree polynomial regression" % (year, degrees)
        print "Train R2 %f"%model.score(X_train, y_train)
        print "Test R2 %f"%model.score(X_test, y_test)

#         y_plot = model.predict(X)
#         plt.plot(X, y_plot)
#         plt.show()

lin_reg()

Old code

This evolved into our model creation code.

We originally started out only correlating aggregate ridership vs. temperature. Our process data function would effectively return temperatures and the total number of hubway rides that happened at specific temperatures through a given year.

Our linear regression started off with a workaround. Because you cannot run a linear regression on a set of data that is not linear (our data at this time looked like a normal curve), we took the natural log of our data in order to make it more linear.

After that, we evolved to using PolynomialFeatures to break down our non-linear data into a form that can be analyzed by a linear regression (check scikit-learn documentation for how this occurs mathematically).

We also tried using a support vector regression, but chose not to continue with that code because our ridge regression code was working with no problems.


In [ ]:
def count_riders(year, month, day, hour):
    """
    Input: year, month, day, hour
    Output: total riders during that hour
    """

    #initialize counter
    counter = 0

    #counts riders during a given hour
    for minute in range(0,60):
        #-1 means that there is no data for that time, so we don't count that
        if hubway.data[year][month][day+1][hour][minute] == -1:
            pass
        else:
            counter += len(hubway.data[year][month][day+1][hour][minute])
    return counter

In [ ]:
def process_data(year):
    """
    Returns 2 lists, 1 of temperatures, 1 of associated ridership.
    """

    #determines whether or not it is a leap year
    if year % 4 == 0:
        numDaysInMonth = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    else:
        numDaysInMonth = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    riders_vs_temp = {}

    #adds all hourly temperatures in given year to dictionary as keys with values of 0
    for m in range(1,13):
        for d in range(numDaysInMonth[m-1]):
            for h in range(0,24):
                if int(float(weather.data[year][m][d+1][h]['tempi'])) < -100:
                    pass
                else:
                    riders_vs_temp[int(float(weather.data[year][m][d+1][h]['tempi']))] = 0

    #adds number of riders to associated temperature in the dictionary
    for month in range(1,13):
        for day in range(numDaysInMonth[month-1]):
            for hour in range(24):
                if int(float(weather.data[year][month][day+1][hour]['tempi'])) < -100:
                    pass
                else:
                    riders_vs_temp[int(float(weather.data[year][month][day+1][hour]['tempi']))] += count_riders(year, month, day, hour)

    return riders_vs_temp.keys(), riders_vs_temp.values()

In [ ]:
"""
Straight up linear regression by taking the log of polynomial data to make it more resemble a linear dataset.
"""

def lin_reg(year):
    
    # import temperature and ridership data
    temperatures, ridership = process_data(year)
    
    # turn data into array
    temps = np.array(temperatures)
    rides = np.array(ridership)
    
    # remove negative values so that np.log will not return NaN
    for i in range(len(temps)):
        if temps[i] < 0:
            temps = np.delete(temps, i)
            rides = np.delete(rides, i)
    
    # take the log of the data so that you can use a linear regression
    temps = np.log(temps)
    rides = np.log(rides)
    
    # removes infinity from log and sets to 0
    for i in range(len(rides)):
        if rides[i] == -np.inf:
            rides[i] = 0
    
    for i in range(len(temps)):
        if temps[i] == -np.inf:
            temps[i] = 0
            
    # make array vertical so that scikit-learn can process it
    temps = temps.reshape(temps.shape[0], -1)
    rides = rides.reshape(rides.shape[0], -1)
    
    X_train, X_test, y_train, y_test = train_test_split(temps, rides, train_size=0.5)

    model = LinearRegression()
    model.fit(X_train, y_train)
    print year
    print "Train R2 %f"%model.score(X_train, y_train)
    print "Test R2 %f"%model.score(X_test, y_test)

lin_reg(2011)
lin_reg(2012)
lin_reg(2013)

In [ ]:
"""
Running a linear regression after transforming a polynomial 
into multiple coefficients so that a linear regression will work
"""

def lin_reg(year):
    
    # import temperature and ridership data
    temperatures, ridership = process_data(year)
    
    # turn data into array
    temps = np.array(temperatures)
    rides = np.array(ridership)

    # remove negative values so that np.log will not return NaN
    for i in range(len(temps)):
        if temps[i] < 0:
            temps = np.delete(temps, i)
            rides = np.delete(rides, i)

    # make array vertical so that scikit-learn can process it
    temps = temps.reshape(temps.shape[0], -1)
    rides = rides.reshape(rides.shape[0], -1)
    
    X_train, X_test, y_train, y_test = train_test_split(temps, rides, train_size=0.5)
    
    for degrees in range(3, 11):
        model = make_pipeline(PolynomialFeatures(degrees), Ridge())

        model.fit(X_train, y_train)
        print "Year %d, %d degree polynomial regression" % (year, degrees)
        print "Train R2 %f"%model.score(X_train, y_train)
        print "Test R2 %f"%model.score(X_test, y_test)

        y_plot = model.predict(temps)
        plt.plot(temps, y_plot)
        plt.show()

lin_reg(2011)
lin_reg(2012)
lin_reg(2013)

In [ ]:
"""
Trying to use a Support Vector Regression (a type of Support Vector Machine)
Does not currently work.
"""

def sup_vec(year):
        
    # import temperature and ridership data
    temperatures, ridership = process_data(year)
    
    # turn data into array
    temps = np.array(temperatures)
    rides = np.array(ridership)

    # remove negative values so that np.log will not return NaN
    for i in range(len(temps)):
        if temps[i] < 0:
            temps = np.delete(temps, i)
            rides = np.delete(rides, i)

    # make array vertical so that scikit-learn can process it
    temps = temps.reshape(temps.shape[0], -1)
    rides = rides.reshape(rides.shape[0], -1)
    
    X_train, X_test, y_train, y_test = train_test_split(temps, rides, train_size=0.5)
    
    model = SVR()
    model.fit(X_train, y_train, sample_weight=None)
    print year
    print "Train R2 %f"%model.score(X_train, y_train)
    print "Test R2 %f"%model.score(X_test, y_test)

sup_vec(2011)
sup_vec(2012)
sup_vec(2013)