In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import cluster, datasets
from sklearn.metrics import confusion_matrix, mean_squared_error
import matplotlib.pyplot as py
from matplotlib.colors import ListedColormap
%matplotlib inline
import numpy as np
iris = datasets.load_iris()
In [2]:
# Get some reasonable names.
X = iris['data']
y = iris['target']
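A quick sanity check on the raw arrays, using the keys that load_iris provides (150 samples, 4 features, 3 classes):
In [ ]:
# Shapes, class names, and class counts for the iris data.
print(X.shape)
print(iris['target_names'])
print(np.bincount(y))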
In [13]:
yHat = np.zeros([len(y)])
# A perfect separator exists for this labeling
#yHat[np.logical_or(y==1,y==2)] = 1
# No perfect separator exists for this labeling
yHat[np.logical_or(y==1,y==0)] = 1
In [14]:
# Now we do it for the real data.
pair = [0,1]
X_train, X_test, y_train, y_test = train_test_split(X[:, pair], yHat,
                                                    test_size=0.5)
In [15]:
# Make a plot
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, marker='o')
Out[15]:
In [25]:
# Run the classifier
clf = DecisionTreeClassifier(max_depth=10,random_state=1234).fit(X_train, y_train)
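The fitted tree also records how much each of the two selected features contributed to its splits:
In [ ]:
# Impurity-based importances of the two features in pair (they sum to 1).
print(clf.feature_importances_)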
In [26]:
# Make some plots, inspired by scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, pair[0]].min() - 1, X[:, pair[0]].max() + 1
y_min, y_max = X[:, pair[1]].min() - 1, X[:, pair[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
py.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, marker='o')
py.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold, marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [27]:
# Print out some metrics
print "Training scores"
print clf.score(X_train,y_train)
print "Testing scores"
print clf.score(X_test,y_test)
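A single 50/50 split can be noisy. cross_val_score is imported above but never used; here is a minimal sketch of averaging accuracy over several splits (cv=5 is an arbitrary choice, not something the original code fixes):
In [ ]:
# Hedged sketch: cross-validated accuracy instead of one train/test split.
scores = cross_val_score(DecisionTreeClassifier(max_depth=10, random_state=1234),
                         X[:, pair], yHat, cv=5)
print(scores.mean(), scores.std())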
In [28]:
# Create a random dataset. Inspired by
# http://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))
In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
In [30]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
Out[30]:
In [31]:
# Run the decision tree regression algorithm
reg = DecisionTreeRegressor(max_depth=5,random_state=1234).fit(X_train, y_train)
In [32]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
X_plot = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
py.plot(X_plot, reg.predict(X_plot), 'g')
Out[32]:
In [34]:
# Use the metrics package to print our errors
print('training error')
print(mean_squared_error(y_train, reg.predict(X_train)))
print('testing error')
print(mean_squared_error(y_test, reg.predict(X_test)))
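max_depth controls the bias/variance trade-off here. A hedged sketch sweeping a few depths (values chosen arbitrarily) to watch training error fall while test error bottoms out and climbs back up:
In [ ]:
# Hedged sketch: training MSE shrinks monotonically with depth;
# test MSE typically reaches a minimum and then rises (overfitting).
for depth in [1, 2, 5, 10]:
    r = DecisionTreeRegressor(max_depth=depth, random_state=1234).fit(X_train, y_train)
    print(depth,
          mean_squared_error(y_train, r.predict(X_train)),
          mean_squared_error(y_test, r.predict(X_test)))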
In [35]:
# Get some reasonable names.
X = iris['data']
y = iris['target']
In [36]:
yHat = np.zeros([len(y)])
# A perfect separator exists for this labeling
yHat[np.logical_or(y==1,y==2)] = 1
# No perfect separator exists for this labeling
#yHat[np.logical_or(y==1,y==0)] = 1
In [37]:
# Now we do it for the real data.
pair = [0,1]
X_train, X_test, y_train, y_test = train_test_split(X[:, pair], yHat,
                                                    test_size=0.5)
In [51]:
# Run the classifier, try 10, 20, and 100 estimators
clf = RandomForestClassifier(max_depth=4,n_estimators=20,random_state=1234).fit(X_train, y_train)
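The comment above suggests trying 10, 20, and 100 estimators; a minimal sketch of that sweep:
In [ ]:
# Hedged sketch: compare the forest sizes named in the comment above.
for n in [10, 20, 100]:
    c = RandomForestClassifier(max_depth=4, n_estimators=n,
                               random_state=1234).fit(X_train, y_train)
    print(n, c.score(X_train, y_train), c.score(X_test, y_test))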
In [52]:
# Make some plots, inspired by scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, pair[0]].min() - 1, X[:, pair[0]].max() + 1
y_min, y_max = X[:, pair[1]].min() - 1, X[:, pair[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
py.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, marker='o')
py.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold, marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [53]:
# Print out some metrics
print "Training scores"
print clf.score(X_train,y_train)
print "Testing scores"
print clf.score(X_test,y_test)
In [35]:
# Create a random dataset. Inspired by
# http://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))
In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
In [37]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
Out[37]:
In [38]:
# Run the random forest regression algorithm (max_depth=1 gives decision stumps)
reg = RandomForestRegressor(max_depth=1, n_estimators=100, random_state=1234).fit(X_train, y_train)
In [39]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
X_plot = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
py.plot(X_plot, reg.predict(X_plot), 'g')
Out[39]:
In [40]:
# Use the metrics package to print our errors
print('training error')
print(mean_squared_error(y_train, reg.predict(X_train)))
print('testing error')
print(mean_squared_error(y_test, reg.predict(X_test)))
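Random forests can also estimate generalization error from the bootstrap samples themselves, with no held-out set. A sketch using the oob_score option (note it reports R^2, not MSE; max_depth=5 is an assumption here, chosen to give the trees room to fit):
In [ ]:
# Hedged sketch: out-of-bag R^2 as a "free" validation estimate.
reg_oob = RandomForestRegressor(max_depth=5, n_estimators=100,
                                oob_score=True, random_state=1234).fit(X, y)
print(reg_oob.oob_score_)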
In [54]:
# Load in the data.
iris = datasets.load_iris()
In [55]:
# Get some reasonable names.
X = iris['data']
y = iris['target']
In [59]:
# Note, there is no test set here. Why!?
# Bad projection
pair = [0,1]
# Good projection
#pair = [1,2]
Xtrain = X[:,pair]
In [60]:
# We make a K-means classifier
k_means = cluster.KMeans(n_clusters=3)
# and run it. Note, "y" does not appear! That is what makes it unsupervised.
k_means.fit(Xtrain)
Out[60]:
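Once fit, the estimator exposes the learned centroids and per-sample assignments:
In [ ]:
# The three learned centroids and the first few cluster assignments.
print(k_means.cluster_centers_)
print(k_means.labels_[:10])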
In [61]:
# Make some plots, inspired by scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = Xtrain[:, 0].min() - 1, Xtrain[:, 0].max() + 1
y_min, y_max = Xtrain[:, 1].min() - 1, Xtrain[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
py.scatter(Xtrain[:, 0], Xtrain[:, 1], c=y, cmap=cmap_bold, marker='o')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [62]:
# Print out a confusion matrix of true labels vs. cluster assignments
print(confusion_matrix(y, k_means.predict(Xtrain)))
In [63]:
# The true y
y
Out[63]:
In [64]:
# Of course, the labels here need to be considered carefully!
k_means.predict(Xtrain)
Out[64]:
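Because cluster labels are an arbitrary permutation of the class labels, a permutation-invariant score is a safer summary than a raw confusion matrix. A minimal sketch using adjusted_rand_score:
In [ ]:
# Hedged sketch: the adjusted Rand index ignores the label permutation
# (1.0 = perfect agreement, ~0.0 = chance).
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(y, k_means.predict(Xtrain)))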
Back to the notes.
In [65]:
# Load in mayavi
import mayavi.mlab as mlab
In [66]:
# Load in the data
from sklearn import manifold, datasets
X, color = datasets.make_swiss_roll(n_samples=4000)
In [67]:
# Take a look at it
mlab.clf()
mlab.points3d(X[:, 0], X[:, 1], X[:, 2],mode='point')
mlab.axes()
mlab.show()
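If mayavi is not installed, matplotlib's 3-D scatter is a serviceable fallback; a sketch, assuming only matplotlib is available:
In [ ]:
# Hedged fallback: plot the swiss roll with matplotlib's 3-D projection.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection)
fig = py.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=py.cm.Spectral, s=2)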
In [68]:
# Compute a locally linear embedding (here via the 'ltsa' variant)
X_r, err = manifold.locally_linear_embedding(X, n_neighbors=10,
                                             n_components=2, method='ltsa')
In [69]:
# Look at the first few rows of the embedding
X_r[:10, :]
Out[69]:
In [70]:
# The embedding
py.plot(X_r[:,0],X_r[:,1],'b.')
Out[70]:
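make_swiss_roll also returned color, each sample's position along the roll; coloring the embedding by it shows whether the manifold actually unrolled:
In [ ]:
# A smooth color gradient across the embedding means the unrolling
# preserved the ordering along the original roll.
py.scatter(X_r[:, 0], X_r[:, 1], c=color, cmap=py.cm.Spectral, s=5)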
In [71]:
m = {'X1':'petal width','X2':'sepal width'}
In [72]:
m['X1']
Out[72]:
In [73]:
m = {}
for i in range(300):
    m['X%d' % i] = 'foo'
In [74]:
m['X1']
Out[74]:
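The same dictionary can be built in one idiomatic line with a comprehension:
In [ ]:
# Equivalent one-liner using a dict comprehension.
m = {'X%d' % i: 'foo' for i in range(300)}
m['X1']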
In [ ]: