In [1]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
%matplotlib inline
# Our new libraries.
from sklearn import cross_validation, linear_model, feature_selection, metrics
import mayavi.mlab as mlab
In [4]:
# Read in the data using pandas.
Xy = pa.read_csv('Advertising.csv')
In [5]:
# Take a look at the contents.
Xy
Out[5]:
In [6]:
# Normalize data
# We do this to make plotting and processing easier. Many Sklearn functions do this
# for you behind the scenes, but we do it explicitly.
# Note that this is a cousin of the physics idea of nondimensionalization. Think
# about the case where TV was measured in millions, while Radio was measured in
# thousands. One could imagine TV totally washing out the effect of Radio.
# In effect, after normalization, each predictor now stands on an "even footing".
#
# Is this always a good idea?
Xy = (Xy-Xy.min())/(Xy.max()-Xy.min())
Xy
Out[6]:
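As an aside, scikit-learn's preprocessing module offers a MinMaxScaler that applies the same (x - min)/(max - min) rescaling column by column. A minimal sketch (re-reading the raw file, since Xy was overwritten above) that should reproduce the manual normalization:
In [ ]:
# Hypothetical alternative to the manual formula: MinMaxScaler rescales each column to [0, 1].
from sklearn import preprocessing
raw = pa.read_csv('Advertising.csv')   # re-read, since Xy was overwritten above
scaled = preprocessing.MinMaxScaler().fit_transform(raw)
print scaled.min(axis=0)   # every column should now start at 0
print scaled.max(axis=0)   # ... and end at 1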
In [7]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV']]
y = Xy.ix[:,['Sales']]
In [24]:
# Last time we did this by hand; now we are smarter and use the sklearn
# routine. This routine splits data into training and testing subsets.
cross_validation.train_test_split([1,2,3,4,5],
[6,7,8,9,10],
test_size=0.4,
random_state=5)
Out[24]:
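The random_state argument seeds the shuffle, which makes the split reproducible. A minimal sketch: running the same call twice with the same seed should print identical train/test pieces, while omitting random_state would give a different split on each run.
In [ ]:
# With a fixed seed, the two calls below should print the same train/test split.
print cross_validation.train_test_split(range(10), test_size=0.4, random_state=5)
print cross_validation.train_test_split(range(10), test_size=0.4, random_state=5)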
In [25]:
# Now we do it for the real data.
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
y,
test_size=0.8)
In [26]:
# Let's take a quick look at the data.
X_train
Out[26]:
In [27]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)
Out[27]:
In [28]:
# These are the slope and intercept of the line we computed.
# Beta_0
print reg.intercept_
# Beta_1
print reg.coef_
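Because y was passed in as a one-column DataFrame, intercept_ and coef_ come back as arrays. The fitted model is just the line Sales ≈ beta_0 + beta_1 * TV (in normalized units), and predict() evaluates that affine map; a minimal sanity check at a hypothetical point x0:
In [ ]:
# Evaluate the fitted line by hand at an arbitrary (hypothetical) point x0.
x0 = 0.5
print reg.predict([[x0]])                      # what sklearn returns
print reg.intercept_[0] + reg.coef_[0][0]*x0   # beta_0 + beta_1*x0, should match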
In [29]:
# Do a plot
plotX = np.linspace(0,1,100)
plotY = reg.predict(np.matrix(plotX).T)
py.plot(X_train,y_train,'ro')
py.plot(X_test,y_test,'go')
py.plot(plotX,plotY,'b-')
Out[29]:
In [30]:
# Use the metrics package to print our errors. See discussion on slides.
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))
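For reference, the mean squared error is just the average of the squared residuals, so we can recompute it directly with numpy; this sketch should agree with metrics.mean_squared_error above.
In [ ]:
# MSE by hand: average of the squared residuals on the test set.
resid = np.ravel(y_test) - np.ravel(reg.predict(X_test))
print np.mean(resid**2)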
Back to slides.
In [31]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV','Radio']]
y = Xy.ix[:,['Sales']]
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
y,
test_size=0.8,
random_state=123)
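A quick shape check on the split; note that with test_size=0.8 only about a fifth of the rows are used for training.
In [ ]:
# Sanity check on the split sizes: roughly 20% train, 80% test.
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape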
In [32]:
# Plot the data to get a feel for it.
mlab.clf()
mlab.points3d(X_train.ix[:,0]/X.ix[:,0].std(),
X_train.ix[:,1]/X.ix[:,1].std(),
y_train.ix[:,0]/y.ix[:,0].std(),
color=(1,0,0), scale_factor=0.2)
mlab.points3d(X_test.ix[:,0]/X.ix[:,0].std(),
X_test.ix[:,1]/X.ix[:,1].std(),
y_test.ix[:,0]/y.ix[:,0].std(),
color=(0,1,0), scale_factor=0.2)
mlab.axes()
mlab.show()
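If mayavi is not installed, a rough substitute is matplotlib's 3D toolkit; this sketch only reproduces the scatter, without the interactivity or the per-axis std scaling used above.
In [ ]:
# Alternative 3D scatter with matplotlib (mpl_toolkits) instead of mayavi.
from mpl_toolkits.mplot3d import Axes3D
fig = py.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train.ix[:,0], X_train.ix[:,1], y_train.ix[:,0], c='r')
ax.scatter(X_test.ix[:,0], X_test.ix[:,1], y_test.ix[:,0], c='g')
ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Sales')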
In [33]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)
Out[33]:
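As before, we can read off the fitted coefficients; now there is one slope per predictor, so the fitted surface is the plane Sales ≈ beta_0 + beta_1*TV + beta_2*Radio (in normalized units).
In [ ]:
# Beta_0
print reg.intercept_
# Beta_1 (TV) and Beta_2 (Radio)
print reg.coef_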
In [34]:
# Create data for plotting
size=10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
yPlot.flatten()])))
zPlot = zPlot.reshape([size,size])
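A quick check on the bookkeeping: predict() wants one row per grid point, so the grid is flattened into a (size*size, 2) array and the predictions are reshaped back onto the size-by-size grid for mlab.mesh.
In [ ]:
# Shapes: the grid and the reshaped predictions should both be (size, size).
print xPlot.shape, yPlot.shape, zPlot.shape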
In [35]:
# Since we will be plotting many times, we define a helper function.
def myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot,size=10,scale_factor=0.05):
mlab.clf()
mlab.points3d(X_train.ix[:,0],
X_train.ix[:,1],
y_train.ix[:,0],
color=(1,0,0), scale_factor=scale_factor)
mlab.points3d(X_test.ix[:,0],
X_test.ix[:,1],
y_test.ix[:,0],
color=(0,1,0), scale_factor=scale_factor)
mlab.mesh(xPlot,yPlot,zPlot,color=(0,0,1))
mlab.axes()
mlab.show()
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)
In [36]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))
Back to the notes.