In [62]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
%matplotlib inline
# Our new libraries.
from sklearn import datasets
from mpl_toolkits.mplot3d import Axes3D
import mayavi.mlab as mlab
iris = datasets.load_iris()
In [63]:
print iris['DESCR']
In [64]:
iris
Out[64]:
In [65]:
X = iris['data']
y = iris['target']
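Before plotting, it helps to know what the four columns and three classes actually are. A minimal sketch, using the feature_names and target_names keys that load_iris provides:
# Column names of X and the three species encoded in y.
print(iris['feature_names'])
print(iris['target_names'])
# 150 samples by 4 features; 150 labels.
print(X.shape)
print(y.shape)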
In [66]:
py.plot(X[y==0,0],X[y==0,1],'r.')
py.plot(X[y==1,0],X[y==1,1],'g.')
py.plot(X[y==2,0],X[y==2,1],'b.')
Out[66]:
In [67]:
py.plot(X[y==0,2],X[y==0,3],'r.')
py.plot(X[y==1,2],X[y==1,3],'g.')
py.plot(X[y==2,2],X[y==2,3],'b.')
Out[67]:
In [68]:
fig = py.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(X[y==0, 0], X[y==0, 1], X[y==0, 2], c='r')
ax.scatter(X[y==1, 0], X[y==1, 1], X[y==1, 2], c='g')
ax.scatter(X[y==2, 0], X[y==2, 1], X[y==2, 2], c='b')
py.show()
In [69]:
mlab.clf()
mlab.points3d(X[y==0, 0], X[y==0, 1], X[y==0, 2],color=(1,0,0))
mlab.points3d(X[y==1, 0], X[y==1, 1], X[y==1, 2],color=(0,1,0))
mlab.points3d(X[y==2, 0], X[y==2, 1], X[y==2, 2],color=(0,0,1))
mlab.axes()
mlab.show()
Is there a more principled way to look at the data? Yes! Let's go back to the notes.
Some sample data to demonstrate PCA on.
In [70]:
mu1 = np.array([0,0,0])
mu2 = np.array([6,0,0])
np.random.seed(123)
Sigma = np.matrix(np.random.normal(size=[3,3]))
# U,E,VT = np.linalg.svd(Sigma)
# E[0] = 1
# E[1] = 1
# E[2] = 1
# Sigma = U*np.diag(E)*VT
Xrandom1 = np.random.multivariate_normal(mu1,np.array(Sigma*Sigma.T),size=500)
Xrandom2 = np.random.multivariate_normal(mu2,np.array(Sigma*Sigma.T),size=500)
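Before plotting, a quick aside: the "most spread out" directions are exactly the principal directions, and we can compute them by hand. A minimal sketch on the Xrandom1 array defined above (PCA amounts to an SVD of the centered data matrix):
# Center the data, then take its SVD; the rows of Vt are the principal
# directions and s**2/(n-1) are the variances captured along them.
Xc = Xrandom1 - Xrandom1.mean(axis=0)
U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
print(Vt)
print(s**2 / (Xc.shape[0] - 1))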
Plot the data so that it is "spread out" as much as possible.
In [71]:
mlab.clf()
mlab.points3d(Xrandom1[:,0], Xrandom1[:,1], Xrandom1[:,2],color=(1,0,0))
mlab.points3d(Xrandom2[:,0], Xrandom2[:,1], Xrandom2[:,2],color=(0,1,0))
mlab.axes()
mlab.show()
We can do the same thing with our classification data.
In [72]:
from sklearn.decomposition import PCA
X2D = PCA(n_components=2).fit_transform(X)
py.plot(X2D[y==0,0],X2D[y==0,1],'r.')
py.plot(X2D[y==1,0],X2D[y==1,1],'g.')
py.plot(X2D[y==2,0],X2D[y==2,1],'b.')
Out[72]:
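How much of the total variance do these two coordinates keep? A minimal sketch, re-fitting the same PCA so that we can read off its explained_variance_ratio_ attribute:
pca = PCA(n_components=2)
pca.fit(X)
# Fraction of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)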
Just as one can project from a high-dimensional space down to two dimensions, one can also project down to three.
In [73]:
fig = py.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
X3D = PCA(n_components=3).fit_transform(X)
ax.scatter(X3D[y==0, 0], X3D[y==0, 1], X3D[y==0, 2], c='r')
ax.scatter(X3D[y==1, 0], X3D[y==1, 1], X3D[y==1, 2], c='g')
ax.scatter(X3D[y==2, 0], X3D[y==2, 1], X3D[y==2, 2], c='b')
py.show()
And do the same with Mayavi.
In [74]:
mlab.clf()
mlab.points3d(X3D[y==0, 0], X3D[y==0, 1], X3D[y==0, 2],color=(1,0,0))
mlab.points3d(X3D[y==1, 0], X3D[y==1, 1], X3D[y==1, 2],color=(0,1,0))
mlab.points3d(X3D[y==2, 0], X3D[y==2, 1], X3D[y==2, 2],color=(0,0,1))
mlab.axes()
mlab.show()
Let's go back to the notes for our first algorithm.
In [75]:
# Load in the support vector machine (SVM) library
from sklearn import svm
In [77]:
# If there is one thing that I want to harp on, it is the difference
# between testing and training errors! So, here we create a training
# set on which we compute the parameters of our algorithm, and a
# testing set for seeing how well we generalize (and work on real
# world problems).
np.random.seed(1236)
perm = np.random.permutation(len(y))
trainSize = 100
Xtrain = X[perm[:trainSize],0:2]
Xtest = X[perm[trainSize:],0:2]
yHat = np.zeros([len(y)])
# Exists a separator
#yHat[np.logical_or(y==1,y==2)] = 1
# No perfect separator
#yHat[np.logical_or(y==1,y==0)] = 1
# All the data
yHat = y
yHattrain = yHat[perm[:trainSize]]
yHattest = yHat[perm[trainSize:]]
But why do you do this? See the notes.
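As an aside, scikit-learn can build this kind of split for us with train_test_split, which we will lean on in the regression section below. A minimal sketch (the names Xtrain2 etc. are just illustrative and the random permutation differs from the manual one above):
from sklearn import cross_validation
# Split the first two features and the labels into 100 training and 50 testing rows.
Xtrain2, Xtest2, ytrain2, ytest2 = cross_validation.train_test_split(
    X[:, 0:2], yHat, test_size=50, random_state=1236)
print(Xtrain2.shape)
print(Xtest2.shape)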
In [78]:
# Some parameters we can get to play with
# If there is no perfect separator, then how much do you penalize points
# that lie on the wrong side?
C = 100.
# The shape of the loss function for points that lie on the wrong side.
loss = 'l2'
In [79]:
# Run the calculation!
clf = svm.LinearSVC(loss=loss,C=C)
clf.fit(Xtrain, yHattrain)
Out[79]:
In [80]:
# Make some plots, inspired by scikit-learn tutorial
from matplotlib.colors import ListedColormap
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training (o) and testing (+) points
py.scatter(Xtrain[:, 0], Xtrain[:, 1], c=yHattrain, cmap=cmap_bold,marker='o')
py.scatter(Xtest[:, 0], Xtest[:, 1], c=yHattest, cmap=cmap_bold,marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [81]:
# Print out some metrics
print 'training score',clf.score(Xtrain,yHattrain)
print 'testing score',clf.score(Xtest,yHattest)
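To see what C actually does, it is worth sweeping it and watching the two scores move; a minimal sketch reusing the split above (the exact numbers depend on the permutation):
# Illustrative sweep: small C = heavy regularization, large C = trust the data.
for Ctry in [0.01, 1., 100., 10000.]:
    clfC = svm.LinearSVC(loss=loss, C=Ctry)
    clfC.fit(Xtrain, yHattrain)
    print('C=%g train=%.3f test=%.3f' % (Ctry,
                                         clfC.score(Xtrain, yHattrain),
                                         clfC.score(Xtest, yHattest)))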
Back to the notes to define our next method.
In [82]:
# Import the K-NN solver
from sklearn import neighbors
In [83]:
# If there is one thing that I want to harp on, it is the difference
# between testing and training errors! So, here we create a training
# set on which we compute the parameters of our algorithm, and a
# testing set for seeing how well we generalize (and work on real
# world problems).
np.random.seed(123)
perm = np.random.permutation(len(y))
trainSize = 50
Xtrain = X[perm[:trainSize],0:2]
Xtest = X[perm[trainSize:],0:2]
ytrain = y[perm[:trainSize]]
ytest = y[perm[trainSize:]]
In [84]:
# Some parameters to play around with
# The number of neighbors to use.
n_neighbors = 7
#weights = 'distance'
weights = 'uniform'
In [85]:
# Run the calculation
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(Xtrain, ytrain)
Out[85]:
In [86]:
# Make some plots inspired by the scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training (o) and testing (+) points
py.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain, cmap=cmap_bold,marker='o')
py.scatter(Xtest[:, 0], Xtest[:, 1], c=ytest, cmap=cmap_bold,marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [87]:
# Print out some scores.
print 'training score',clf.score(Xtrain,ytrain)
print 'testing score',clf.score(Xtest,ytest)
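The same exercise for the number of neighbors; a minimal sketch sweeping k with the split above (tiny k tends to overfit, huge k to underfit):
# Illustrative sweep over the number of neighbors.
for k in [1, 3, 7, 15, 31]:
    clfK = neighbors.KNeighborsClassifier(k, weights=weights)
    clfK.fit(Xtrain, ytrain)
    print('k=%d train=%.3f test=%.3f' % (k,
                                         clfK.score(Xtrain, ytrain),
                                         clfK.score(Xtest, ytest)))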
Back to the notes.
In [2]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
# Our new libraries.
from sklearn import cross_validation, linear_model, feature_selection, metrics
import mayavi.mlab as mlab
In [3]:
# Read in the data using pandas.
Xy = pa.read_csv('Advertising.csv')
In [4]:
# Take a look at the contents.
Xy
Out[4]:
In [5]:
# Normalize data
# We do this to make plotting and processing easier. Many scikit-learn functions
# do this for you behind the scenes, but here we do it explicitly.
# Note that this is a cousin of the physics idea of nondimensionalization. Think
# about the case where TV was measured in millions, while Radio was measured in
# thousands. One could imagine TV totally washing out the effect of Radio.
# In effect, after normalization, each predictor now stands on an "even footing".
#
# Is this always a good idea?
Xy = (Xy-Xy.min())/(Xy.max()-Xy.min())
Xy
Out[5]:
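The same rescaling is available as preprocessing.MinMaxScaler in scikit-learn; a minimal sketch, applied here to the already-normalized frame purely as a sanity check (so it should change nothing):
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler implements the same (x - min)/(max - min) rescaling column by column.
XyCheck = MinMaxScaler().fit_transform(Xy)
print(np.abs(XyCheck - Xy.values).max())  # should be 0 up to rounding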
In [6]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV']]
y = Xy.ix[:,['Sales']]
In [7]:
# Last time we did this by hand; now we are smarter and use the sklearn
# routine. This routine splits data into training and testing subsets.
cross_validation.train_test_split([1,2,3,4,5],
[6,7,8,9,10],
test_size=0.4,
random_state=5)
Out[7]:
In [8]:
# Now we do it for the real data.
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
y,
test_size=0.8)
In [9]:
# Let's take a quick look at the data.
X_train
Out[9]:
In [10]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)
Out[10]:
In [11]:
# These are the slope and intercept of the line we computed.
# Beta_0
print reg.intercept_
# Beta_1
print reg.coef_
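As a quick sanity check (a minimal sketch using the fitted reg and X_train above), the predictions are nothing more than the line beta_0 + beta_1 * TV:
# Evaluate the fitted line by hand and compare with reg.predict.
byHand = reg.intercept_ + X_train.values * reg.coef_.ravel()
print(np.allclose(byHand, reg.predict(X_train)))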
In [12]:
# Do a plot
plotX = np.linspace(0,1,100)
plotY = reg.predict(np.matrix(plotX).T)
py.plot(X_train,y_train,'ro')
py.plot(X_test,y_test,'go')
py.plot(plotX,plotY,'b-')
Out[12]:
In [13]:
# Use the metrics package to print our errors. See discussion on slides.
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))
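MSE is not the only summary worth printing; the score method of the fitted regression reports R^2 on whichever set we hand it (a minimal sketch, with the same training-versus-testing caveat):
print('training R^2 %f' % reg.score(X_train, y_train))
print('testing R^2 %f' % reg.score(X_test, y_test))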
Back to slides.
In [14]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV','Radio']]
y = Xy.ix[:,['Sales']]
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
y,
test_size=0.8,
random_state=123)
In [15]:
# Plot the data to get a feel for it.
mlab.clf()
mlab.points3d(X_train.ix[:,0]/X.ix[:,0].std(),
X_train.ix[:,1]/X.ix[:,1].std(),
y_train.ix[:,0]/y.ix[:,0].std(),
color=(1,0,0), scale_factor=0.2)
mlab.points3d(X_test.ix[:,0]/X.ix[:,0].std(),
X_test.ix[:,1]/X.ix[:,1].std(),
y_test.ix[:,0]/y.ix[:,0].std(),
color=(0,1,0), scale_factor=0.2)
mlab.axes()
mlab.show()
In [16]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)
Out[16]:
In [17]:
# Create data for plotting
size=10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
yPlot.flatten()])))
zPlot = zPlot.reshape([size,size])
In [18]:
# Since we will be plotting many times, we define a helper function.
def myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot,size=10,scale_factor=0.05):
mlab.clf()
mlab.points3d(X_train.ix[:,0],
X_train.ix[:,1],
y_train.ix[:,0],
color=(1,0,0), scale_factor=scale_factor)
mlab.points3d(X_test.ix[:,0],
X_test.ix[:,1],
y_test.ix[:,0],
color=(0,1,0), scale_factor=scale_factor)
mlab.mesh(xPlot,yPlot,zPlot,color=(0,0,1))
mlab.axes()
mlab.show()
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)
In [19]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))
Back to the notes.
In [20]:
# Now we try non-linear fitting. See notes for details.
# Note that we add a new column which is a *non-linear* function
# of the original data!
XyNonlinear = Xy.copy()
XyNonlinear['TV*Radio'] = Xy['TV']*Xy['Radio']
# Select out our predictor columns and our response columns
X = XyNonlinear.ix[:,['TV','Radio','TV*Radio']]
y = XyNonlinear.ix[:,['Sales']]
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
y,
test_size=0.8,
random_state=123)
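Building the interaction column by hand is fine here, but scikit-learn can also generate it; a minimal sketch with preprocessing.PolynomialFeatures (assuming a version that ships it), where interaction_only=True reproduces exactly the columns TV, Radio, TV*Radio:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
XPoly = poly.fit_transform(Xy.ix[:, ['TV', 'Radio']])
# Same design matrix as X above, up to floating point.
print(np.allclose(XPoly, X.values))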
In [21]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)
Out[21]:
In [22]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
yPlot.flatten(),
(xPlot*yPlot).flatten()])))
zPlot = zPlot.reshape([size,size])
In [23]:
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)
In [24]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))
Back to the notes.
In [25]:
# What about adding many non-linear combinations? See notes for details.
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [26]:
# Run the solver
regOver = linear_model.LinearRegression(fit_intercept=True)
regOver.fit(X_train,y_train)
Out[26]:
In [27]:
print regOver.intercept_
print regOver.coef_
In [28]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = regOver.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [29]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [30]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regOver.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regOver.predict(X_test))
Back to notes.
In [31]:
# Fortunately, there is a *lot* that one can do to help. It is possible to have
# many predictors but still get good answers. See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
names = []
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
names.append('TV**%d*Radio**%d'%(i,j))
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [32]:
# We can try n_features_to_select=None and 3 to see what we get.
selector = feature_selection.RFE(regOver,n_features_to_select=3)
selector.fit(X_train,y_train)
Out[32]:
In [33]:
# Print out the predictors we use. These are the predictors selected by the
# RFE algorithm as the most important.
for i in range(len(names)):
print names[i],
print selector.get_support()[i]
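Instead of guessing n_features_to_select, RFECV chooses it by cross-validation; a minimal sketch on the same training data (the number of features it keeps depends on the folds):
# Let cross-validation decide how many of the 25 polynomial features to keep.
selectorCV = feature_selection.RFECV(
    linear_model.LinearRegression(fit_intercept=True), cv=5)
selectorCV.fit(X_train, np.ravel(y_train))
print('features kept %d' % selectorCV.n_features_)
print('testing R^2 %f' % selectorCV.score(X_test, np.ravel(y_test)))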
In [34]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = selector.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [35]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [36]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,selector.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,selector.predict(X_test))
Back to notes.
In [37]:
# Lasso regression is another method for doing feature selection.
# It is, by far, my favorite; it is a close cousin of my personal
# research topic. See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
names = []
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
names.append('TV**%d*Radio**%d'%(i,j))
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [38]:
# Run the solver
regLasso = linear_model.Lasso(alpha=0.002,fit_intercept=True,normalize=True)
regLasso.fit(X_train,y_train)
Out[38]:
In [39]:
# Print out the predictors we use. The betas with non-zero weights are those
# selected by the Lasso algorithm as being the most important. What do you notice?
print regLasso.intercept_
for i in range(len(regLasso.coef_)):
print names[i],regLasso.coef_[i]
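The penalty alpha above was chosen by hand; LassoCV picks it by cross-validation over a grid of candidates (a minimal sketch reusing the same training data):
# Fit the Lasso path and let cross-validation choose alpha.
regLassoCV = linear_model.LassoCV(normalize=True, cv=5)
regLassoCV.fit(X_train, np.ravel(y_train))
print('chosen alpha %g' % regLassoCV.alpha_)
print('nonzero coefficients %d' % np.sum(regLassoCV.coef_ != 0))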
In [40]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = regLasso.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [41]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [42]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regLasso.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regLasso.predict(X_test))