In [1]:
# allow importing modules and datafiles up one directory
import os
os.chdir('../')
import pandas as pd
import numpy as np
import datetime
import math
import datatables.traveltime
In [2]:
# Load the training file with the project-local reader. The relative path
# resolves against the parent directory because the first cell calls
# os.chdir('../').
data = datatables.traveltime.read('data/traveltime.task.train')
# Preview the first rows (assumes read() returns a pandas DataFrame - TODO confirm).
data.head()
Out[2]:
'y' is travel time in seconds.
In [3]:
def extract_features(data, degree=2):
    """Build the polynomial design matrix for the volume -> travel time regression.

    Parameters
    ----------
    data : column-indexable table (e.g. a pandas DataFrame)
        Must expose a 'volume' column whose ``.values`` is a 1-D numeric array.
    degree : int, optional
        Highest power of volume to include. The default of 2 yields the
        columns ``[vol, vol**2]`` (quadratic regression).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, degree)`` whose k-th column is ``vol**(k+1)``.

    Raises
    ------
    ValueError
        If ``degree`` is smaller than 1.
    """
    if degree < 1:
        raise ValueError("degree must be >= 1, got %r" % (degree,))
    # Turn the 1-D volume column into an n x 1 column vector so that the
    # powers can be stacked side by side as separate features.
    vol = data['volume'].values[:, np.newaxis]
    # One column per power: vol, vol^2, ..., vol^degree.
    return np.hstack([vol ** power for power in range(1, degree + 1)])
# Raw volume kept as an n x 1 column; it is the x axis of the plot below.
vol = data['volume'].values.reshape(-1, 1)
# Regression target (the 'y' column) and the design matrix built from volume.
y = data['y'].values
xs = extract_features(data)
In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import linear_model
# Fit travel time against the features in xs by ordinary least squares.
regr = linear_model.LinearRegression()
regr.fit(xs, y)
y_pred = regr.predict(xs)

# plt.plot joins points in the order they are given. Sort by volume first so
# the fitted curve is drawn left to right; without this, rows that are not
# already ordered by volume produce a tangle of chords instead of a curve.
order = np.argsort(vol[:, 0])

plt.figure(figsize=(8,8))
plt.scatter(vol, y, color='black', label='actual')
plt.plot(vol[order, 0], y_pred[order], color='blue', label='quadratic regression')
plt.title("Travel time vs volume. Princes Highway. Outbound. Wed 19 Aug 2015")
plt.ylabel("Travel Time from site 2409 to site 2425 (seconds)")
plt.xlabel("Volume at site 2433. Detector 6. Righthand lane. (vehicles per 15 minute interval)")
plt.legend(loc='lower right')
# Anchor both axes at zero; the upper limits stay automatic.
plt.xlim([0,None])
plt.ylim([0,None])
plt.show()

# http://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares
print('Intercept: %.2f' % regr.intercept_)
print('Coefficients: %s' % regr.coef_)
# score() is evaluated on the same data the model was fitted on.
print('R^2 score: %.2f' % regr.score(xs, y))
In [5]:
# Evaluate the model fitted above on the separate test file.
# NOTE(review): 27 Aug 2015 fell on a Thursday - confirm whether the weekday or the date in the comment below is wrong.
test = datatables.traveltime.read('data/traveltime.task.test') # Traffic on Wed 27 Aug 2015
test_xs = extract_features(test)
test['pred'] = regr.predict(test_xs)
# Residual per row: actual minus predicted travel time.
test['error'] = test['y'] - test['pred']
# todo: ensure data is a real number (complex numbers could be used to cheat)
# Root-mean-square error over the TEST rows. The mean must divide by len(test);
# dividing by len(data) (the training-set size) gives a wrong value whenever
# the two files have different row counts.
rms_error = math.sqrt(sum(test['error']**2) / len(test))
In [6]:
# Preview the first test rows, including the 'pred' and 'error' columns added in the previous cell.
test.head()
Out[6]:
In [7]:
# Show the root-mean-square prediction error computed above (same units as 'y', i.e. seconds).
rms_error
Out[7]: