In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [96]:
weather = pd.read_table('daily_weather.tsv')
usage = pd.read_table('usage_2012.tsv')
stations = pd.read_table('stations.tsv')
  1. To start with, we'll need to compute the number of rentals per station per day. Use pandas to do that.

In [97]:
# rides starting at each station, divided by the 366 days of 2012
station_counts = usage.groupby('station_start')['station_start'].count()
station_rentals_per_day = DataFrame()
station_rentals_per_day['rentals'] = station_counts.values / 366.0
station_rentals_per_day['station'] = station_counts.index

In [98]:
station_rentals_per_day.head()


Out[98]:
     rentals                        station
0  17.289617                 10th & E St NW
1  10.418033            10th & Monroe St NE
2  71.243169                 10th & U St NW
3  55.773224  10th St & Constitution Ave NW
4  35.185792                 11th & H St NE
  1. a. Our stations data has a huge number of quantitative attributes: fast_food, parking, restaurant, etc. Some of them are encoded as 0 or 1 (for absence or presence); others represent counts. To start with, run a simple linear regression where the input (x) variables are all the various station attributes and the output (y) variable is the average number of rentals per day.

In [99]:
# average rentals per day for each start station, keyed by station name
s = stations[['station']]
u = pd.concat([usage['station_start']], axis=1, keys=['station'])
counts = u['station'].value_counts()
c = DataFrame(counts.index, columns=['station'])
c['counts'] = counts.values
c['counts'] = c['counts'].apply(lambda x: x / 366.0)  # 366 days in 2012
m = pd.merge(s, c, on='station')                      # keep only stations present in both tables
stations_data = stations.merge(m, on='station')       # attach the station attributes

In [100]:
# build a frame of station name + average rentals, then merge the station attributes back on
df = DataFrame(stations_data.index, columns=['station'])
df['avg_rentals'] = m[['counts']]
df['station'] = m[['station']]
stations_vals = pd.merge(left=df, right=stations, on='station')
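
The same information can be assembled more directly. A sketch of an equivalent construction (it assumes the same 'station_start' and 'station' column names used above; note the resulting column order differs from stations_vals, so the positional slices below would need adjusting):

In [ ]:
# compact equivalent: count rides per start station, average over 2012's 366 days, join attributes
ride_counts = usage['station_start'].value_counts()
rentals = DataFrame({'station': ride_counts.index, 'avg_rentals': ride_counts.values / 366.0})
stations_vals_alt = stations.merge(rentals, on='station')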

In [101]:
# x: the station attribute columns (everything from column 8 onward)
# y: the average rentals per day
x = stations_vals[list(stations_vals.columns.values[8:])]
y = stations_vals[list(stations_vals.columns.values[1:2])]
linear_regression = linear_model.LinearRegression()
linear_regression.fit(x, y)


Out[101]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
  1. b. Plot the predicted values (model.predict(x)) against the actual values and see how they compare.

In [102]:
plt.scatter(linear_regression.predict(x), y)
plt.xlabel('predicted values')
plt.ylabel('actual values')
plt.show()
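
A y = x reference line makes over- and under-prediction easier to judge; a small sketch of the same plot with that line added:

In [ ]:
# same scatter, with a y = x reference line for comparison
preds = linear_regression.predict(x)
plt.scatter(preds, y)
lims = [min(preds.min(), y.values.min()), max(preds.max(), y.values.max())]
plt.plot(lims, lims, 'k--')
plt.xlabel('predicted values')
plt.ylabel('actual values')
plt.show()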


  1. c. In this case, there are 129 input variables and only 185 rows, which means we're very likely to overfit. Look at the model coefficients and see if anything jumps out as odd.

In [103]:
linear_regression.coef_


Out[103]:
array([[  2.33163477e+00,  -2.58798372e-01,   7.38677614e-02,
         -6.23427298e+01,   2.02686671e+00,  -4.16148234e+00,
          5.55342631e+00,   1.78810014e+00,  -3.64436832e-01,
          1.46056713e-12,   2.38199452e+00,   6.88744836e+01,
          4.53635614e+00,  -1.04894937e-11,  -2.95916174e-12,
          5.24210306e+00,  -6.00994500e-12,   4.39207876e+00,
          9.26717502e+00,  -3.03514381e+00,   3.19641175e+00,
         -3.75470868e+01,   2.78908576e+01,  -3.85399854e+01,
         -1.58860736e+01,   2.17147435e+01,   1.50931200e+00,
          3.01466161e+00,  -4.32875603e+00,   8.85951246e+00,
          2.17147435e+01,  -2.71793215e-01,   3.44540930e+00,
          1.28336434e+01,   4.84020690e+00,  -2.04169817e+00,
         -3.45267737e+01,  -1.61959403e+01,   5.26641596e+00,
          5.16406843e+00,   1.13106239e+01,  -5.95501207e+00,
          1.47321423e+02,  -5.49234411e+00,  -1.29090263e+01,
          1.82647435e+02,  -9.14848559e-02,   3.95779169e-01,
         -6.87707451e+00,  -8.79545330e+00,   7.68222101e+01,
          5.86840001e+00,  -6.82144585e+00,  -1.81890985e+01,
          3.29939558e+01,   9.84610988e+00,   2.39896883e+01,
          5.46618714e+01,   1.50521662e+01,   5.46288475e+00,
          1.20106526e+01,  -1.81418486e+01,  -2.90273966e+00,
         -2.48860730e+01,  -2.65368604e+01,   1.78832981e+01,
         -2.55713270e+01,  -1.64291473e+01,  -1.65990282e+01,
         -2.04710247e+01,   4.54623146e+01,  -8.51557316e+01,
         -1.96379535e+01,   1.23740873e+01,   1.54615583e+00,
          3.40789783e+01,   3.40789783e+01,  -1.43689678e+01,
          3.71609277e+01,  -3.79488690e+00,  -3.79488690e+00,
          1.00390599e+02,   5.26741753e-15,  -8.14645916e+00,
          1.18039812e+01,   2.71335491e-02,   2.71335491e-02,
         -1.11480614e+01,   4.28200683e-14,  -7.04519636e-15,
          2.89463151e+01,   1.41436082e-14,   1.58308163e+01,
         -4.92020800e+01,  -1.62320599e-15,   1.38050296e-14,
         -5.94424353e-15,  -1.64519249e-14,   4.81269638e-15,
          7.09583401e-15,  -8.31816655e-15,   2.10652855e-14,
          4.46775687e-02,   3.52388526e-01,  -1.09254290e+00,
         -5.89216326e-01,   4.04781121e+01,   0.00000000e+00,
         -3.82194484e-01,  -4.82615546e+00,   3.25702981e+01,
          1.23660306e+01,  -3.48964379e+01,  -5.39321341e+00,
          1.71561409e+01,   0.00000000e+00,   3.78601422e+00,
         -1.02153895e+00,  -1.90777295e+00,  -6.23585766e+00,
         -1.09658842e+00,  -9.48229196e+01,   1.65238016e+00,
         -2.21991544e+01,  -2.24617220e+00,  -1.84557760e+01,
         -2.10465754e+01,  -3.37190472e+01,   7.18158688e+00]])

Several coefficients are implausibly large (on the order of 1e+02) while others are effectively zero (around 1e-12 or smaller), which points to collinear or near-constant attribute columns and a badly overfit model that is unlikely to predict well.
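
To see which attributes the extreme coefficients belong to, we can pair them with the column names (a quick sketch; x is still the full attribute frame here):

In [ ]:
# pair each coefficient with its attribute name and look at the extremes
coefs = Series(linear_regression.coef_.ravel(), index=x.columns)
print(coefs[coefs.abs() > 50])      # implausibly large coefficients
print(coefs[coefs.abs() < 1e-10])   # effectively zero -- likely constant or duplicate columns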

  1. d. Go back and split the data into a training set and a test set. Train the model on the training set and evaluate it on the test set. How does it do?

In [104]:
# hold out a third of the stations as a test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
lin_regr = linear_model.LinearRegression()
lin_regr.fit(x_train, y_train)


Out[104]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [105]:
plt.scatter(lin_regr.predict(x_test), y_test)
plt.xlabel('predicted values')
plt.ylabel('actual values')
plt.show()



In [106]:
plt.scatter(y_test, lin_regr.predict(x_test))


Out[106]:
<matplotlib.collections.PathCollection at 0x115bfb090>

The test-set predictions scatter widely around the actual values, with several extreme outliers: the overfit model does not generalize well.
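
To put a number on it, r2_score (imported above) can compare the fit on the training and held-out data; a sketch:

In [ ]:
# R^2 on the training set vs. the held-out test set
print('train R^2: %.3f' % r2_score(y_train, lin_regr.predict(x_train)))
print('test  R^2: %.3f' % r2_score(y_test, lin_regr.predict(x_test)))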

  1. a. Since we have so many variables, this is a good candidate for regularization. In particular, since we'd like to eliminate a lot of them, lasso seems like a good candidate. Build a lasso model on your training data for various values of alpha. Which variables survive?

In [107]:
model = Lasso(alpha=.1)
model.fit(x_train, y_train)
np.round(model.coef_, 1)


Out[107]:
array([  2.5,  -1.7,   0.5,   0. ,  -3.8,  -0.2,   8. ,   0. ,  -0.1,
         0. ,  -0.6,   0. ,  -1.1,   0. ,   0. ,   0. ,   0. ,  -4.9,
         0. ,  -0.1,  -0.1,   6.9,   5.1, -10.6,  -0. ,  -0. ,  -3.1,
         3.3,   7.2,   0.5,  -0. ,  -0. ,   1.7,   7.2,   3.5,   0. ,
        -0. , -10.7,  -0. ,  -0. ,  -6.9,   3.4,  66.1,   2.1,  -5. ,
        27.6,   6.6,  -1.1,  -0. ,  -0. ,  -0. ,  -0. ,  -0. ,   0. ,
         1.7,   7.2,  23.1,   0. ,   0. ,   0. ,   1.1,   0. ,   0.5,
         0. ,  -1.3,   0. ,   0. ,   1.1,   0. ,  -0. ,   0. ,  -0. ,
       -13. ,   0. ,  -4.9,  -0.7,  -0. ,  -0. ,  -0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,  -0. ,  -0. ,  -0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,  -0.2,   0.4,  -1.9,  -0. ,  18.7,   0. ,
        -1.1,  -3.5,   0. ,   2. , -10.2,  -0. ,   0. ,   0. ,  12.8,
        -0.8,  -0. ,  -1.1,   0. ,   0. ,   2.1,   0. ,  -0. ,  -2.1,
        -0. ,  -0. ,   0. ])

In [108]:
model = Lasso(alpha=.5)
model.fit(x_train, y_train)
np.round(model.coef_, 1)


Out[108]:
array([  1.3,  -1. ,   0.8,   0. ,  -1.7,  -0.7,   9. ,   8.9,  -0.1,
         0. ,   0. ,   0. ,  -0. ,   0. ,   0. ,   0. ,   0. ,  -0. ,
         0. ,  -0.7,  -0. ,   0. ,   0. ,  -0. ,  -0. ,  -0. ,  -5.2,
         2.6,   8.7,   0. ,  -0. ,  -0. ,   0. ,   0. ,   2.4,   0. ,
         0. ,  -7. ,  -0. ,  -0. ,  -0. ,   0. ,   0. ,   0. ,  -0. ,
         0. ,   0. ,  -0. ,  -0. ,  -0. ,   0. ,  -0. ,  -0. ,   0. ,
         0. ,   0. ,   8.2,   0. ,   0. ,   0. ,   0. ,  -0. ,  -0. ,
         0. ,  -0. ,   0. ,   0. ,   0. ,   0. ,  -0. ,   0. ,  -0. ,
        -0. ,  -0. ,  -0. ,  -0. ,  -0. ,  -0. ,  -0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,  -0. ,  -0. ,  -0. ,  -0. ,   0. ,   0. ,
        -0. ,   0. ,  -0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,  -0.1,   0.4,  -0. ,   0. ,   0. ,   0. ,
        -1.1,  -1.8,   0. ,   0. ,  -0. ,  -0. ,  -0. ,   0. ,  13.1,
        -0.1,   0. ,  -0. ,   0. ,   0. ,   0. ,   0. ,  -0. ,  -0. ,
        -0. ,  -0. ,  -0. ])
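
To answer "which variables survive", we can refit over a range of alphas and list the attributes whose coefficients stay non-zero. A sketch (the column names are taken from x, which still holds the full attribute frame used to build x_train):

In [ ]:
# variables that keep non-zero coefficients as alpha grows
for alpha in [0.1, 0.5, 1.0, 5.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(x_train, y_train)
    survivors = x.columns[np.abs(np.ravel(lasso.coef_)) > 1e-6]
    print('alpha=%.1f: %d variables survive' % (alpha, len(survivors)))
    print(list(survivors))
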
  1. b. How does this model perform on the test set?

In [109]:
plt.scatter(model.predict(x_test), y_test)  # use the lasso model fit above
plt.xlabel('predicted values')
plt.ylabel('actual values')
plt.show()


Better: the lasso's test-set predictions track the actual values more closely than the unregularized regression's.
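
Quantifying the comparison with r2_score (model here is the alpha=0.5 lasso fit above):

In [ ]:
# test-set R^2: unregularized regression vs. the lasso
print('linear regression: %.3f' % r2_score(y_test, lin_regr.predict(x_test)))
print('lasso (alpha=0.5): %.3f' % r2_score(y_test, model.predict(x_test)))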

  1. No matter how high I make alpha, the coefficient on crossing ("number of nearby crosswalks") never goes away. Try a simple linear regression on just that variable.

In [110]:
# the 'crossing' column, selected here by its position in stations_vals
x = stations_vals[list(stations_vals.columns.values[111:112])]
y = stations_vals[list(stations_vals.columns.values[1:2])]
lin_regr = linear_model.LinearRegression()
lin_regr.fit(x, y)


Out[110]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [115]:
plt.scatter(lin_regr.predict(x), y)
plt.xlabel('predicted value')
plt.ylabel('actual value')
plt.show()
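
The single-variable fit can also be summarized numerically; a sketch (lin_regr, x, and y here are the crossing-only model and data from the cells above):

In [ ]:
# coefficient and R^2 for the crossing-only model
print('coefficient on crossing: %.3f' % lin_regr.coef_.ravel()[0])
print('R^2 (crossing only): %.3f' % r2_score(y, lin_regr.predict(x)))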


