In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import cluster, datasets
from sklearn.metrics import confusion_matrix, mean_squared_error
import matplotlib.pyplot as py
from matplotlib.colors import ListedColormap
%matplotlib inline
import numpy as np
iris = datasets.load_iris()
In [2]:
# Get some reasonable names.
X = iris['data']
y = iris['target']
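A quick sanity check on the raw arrays, using the keys that load_iris provides (150 samples, 4 features, 3 classes):
In [ ]:
# Shapes, class names, and class counts for the iris data.
print(X.shape)
print(iris['target_names'])
print(np.bincount(y))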
In [13]:
yHat = np.zeros([len(y)])
# A perfect separator exists for this labeling
#yHat[np.logical_or(y==1,y==2)] = 1
# No perfect separator exists for this labeling
yHat[np.logical_or(y==1,y==0)] = 1
In [14]:
# Now we do it for the real data.
pair = [0,1]
X_train, X_test, y_train, y_test = train_test_split(X[:, pair], yHat,
                                                    test_size=0.5)
In [15]:
# Make a plot
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, marker='o')
Out[15]:
In [25]:
# Run the classifier
clf = DecisionTreeClassifier(max_depth=10,random_state=1234).fit(X_train, y_train)
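The fitted tree also records how much each of the two selected features contributed to its splits:
In [ ]:
# Impurity-based importances of the two features in pair (they sum to 1).
print(clf.feature_importances_)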
In [26]:
# Make some plots, inspired by scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, pair[0]].min() - 1, X[:, pair[0]].max() + 1
y_min, y_max = X[:, pair[1]].min() - 1, X[:, pair[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
py.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, marker='o')
py.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold, marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [27]:
# Print out some metrics
print "Training scores"
print clf.score(X_train,y_train)
print "Testing scores"
print clf.score(X_test,y_test)
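A single 50/50 split can be noisy. cross_val_score is imported above but never used; here is a minimal sketch of averaging accuracy over several splits (cv=5 is an arbitrary choice, not something the original code fixes):
In [ ]:
# Hedged sketch: cross-validated accuracy instead of one train/test split.
scores = cross_val_score(DecisionTreeClassifier(max_depth=10, random_state=1234),
                         X[:, pair], yHat, cv=5)
print(scores.mean(), scores.std())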
In [28]:
# Create a random dataset. Inspired by
# http://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))
In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
In [30]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
Out[30]:
In [31]:
# Run the decision tree regression algorithm
reg = DecisionTreeRegressor(max_depth=5,random_state=1234).fit(X_train, y_train)
In [32]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
X_plot = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
py.plot(X_plot, reg.predict(X_plot), 'g')
Out[32]:
In [34]:
# Use the metrics package to print our errors
print('training error')
print(mean_squared_error(y_train, reg.predict(X_train)))
print('testing error')
print(mean_squared_error(y_test, reg.predict(X_test)))
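max_depth controls the bias/variance trade-off here. A hedged sketch sweeping a few depths (values chosen arbitrarily) to watch training error fall while test error bottoms out and climbs back up:
In [ ]:
# Hedged sketch: training MSE shrinks monotonically with depth;
# test MSE typically reaches a minimum and then rises (overfitting).
for depth in [1, 2, 5, 10]:
    r = DecisionTreeRegressor(max_depth=depth, random_state=1234).fit(X_train, y_train)
    print(depth,
          mean_squared_error(y_train, r.predict(X_train)),
          mean_squared_error(y_test, r.predict(X_test)))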
In [35]:
# Get some reasonable names.
X = iris['data']
y = iris['target']
In [36]:
yHat = np.zeros([len(y)])
# A perfect separator exists for this labeling
yHat[np.logical_or(y==1,y==2)] = 1
# No perfect separator exists for this labeling
#yHat[np.logical_or(y==1,y==0)] = 1
In [37]:
# Now we do it for the real data.
pair = [0,1]
X_train, X_test, y_train, y_test = train_test_split(X[:, pair], yHat,
                                                    test_size=0.5)
In [51]:
# Run the classifier, try 10, 20, and 100 estimators
clf = RandomForestClassifier(max_depth=4,n_estimators=20,random_state=1234).fit(X_train, y_train)
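The comment above suggests trying 10, 20, and 100 estimators; a minimal sketch of that sweep:
In [ ]:
# Hedged sketch: compare the forest sizes named in the comment above.
for n in [10, 20, 100]:
    c = RandomForestClassifier(max_depth=4, n_estimators=n,
                               random_state=1234).fit(X_train, y_train)
    print(n, c.score(X_train, y_train), c.score(X_test, y_test))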
In [52]:
# Make some plots, inspired by scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, pair[0]].min() - 1, X[:, pair[0]].max() + 1
y_min, y_max = X[:, pair[1]].min() - 1, X[:, pair[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
py.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, marker='o')
py.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold, marker='+')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [53]:
# Print out some metrics
print "Training scores"
print clf.score(X_train,y_train)
print "Testing scores"
print clf.score(X_test,y_test)
In [35]:
# Create a random dataset. Inspired by
# http://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))
In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
In [37]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
Out[37]:
In [38]:
# Run the random forest regression algorithm (max_depth=1 gives decision stumps)
reg = RandomForestRegressor(max_depth=1, n_estimators=100, random_state=1234).fit(X_train, y_train)
In [39]:
# Plot the data
py.plot(X_train, y_train, 'bo')
py.plot(X_test, y_test, 'r+')
X_plot = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
py.plot(X_plot, reg.predict(X_plot), 'g')
Out[39]:
In [40]:
# Use the metrics package to print our errors
print('training error')
print(mean_squared_error(y_train, reg.predict(X_train)))
print('testing error')
print(mean_squared_error(y_test, reg.predict(X_test)))
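Random forests can also estimate generalization error from the bootstrap samples themselves, with no held-out set. A sketch using the oob_score option (note it reports R^2, not MSE; max_depth=5 is an assumption here, chosen to give the trees room to fit):
In [ ]:
# Hedged sketch: out-of-bag R^2 as a "free" validation estimate.
reg_oob = RandomForestRegressor(max_depth=5, n_estimators=100,
                                oob_score=True, random_state=1234).fit(X, y)
print(reg_oob.oob_score_)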
In [54]:
# Load in the data.
iris = datasets.load_iris()
In [55]:
# Get some reasonable names.
X = iris['data']
y = iris['target']
In [59]:
# Note, there is no test set here. Why!?
# Bad projection
pair = [0,1]
# Good projection
#pair = [1,2]
Xtrain = X[:,pair]
In [60]:
# We make a K-means classifier
k_means = cluster.KMeans(n_clusters=3)
# and run it. Note, "y" does not appear! That is what makes it unsupervised.
k_means.fit(Xtrain)
Out[60]:
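Once fit, the estimator exposes the learned centroids and per-sample assignments:
In [ ]:
# The three learned centroids and the first few cluster assignments.
print(k_means.cluster_centers_)
print(k_means.labels_[:10])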
In [61]:
# Make some plots, inspired by scikit-learn tutorial
# step size in the mesh for plotting the decision boundary.
h = .02
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = Xtrain[:, 0].min() - 1, Xtrain[:, 0].max() + 1
y_min, y_max = Xtrain[:, 1].min() - 1, Xtrain[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
py.figure(1, figsize=(8, 6))
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
py.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
py.scatter(Xtrain[:, 0], Xtrain[:, 1], c=y, cmap=cmap_bold, marker='o')
py.xlim(xx.min(), xx.max())
py.ylim(yy.min(), yy.max())
py.show()
In [62]:
# Print out a confusion matrix of true labels vs. cluster assignments
print(confusion_matrix(y, k_means.predict(Xtrain)))
In [63]:
# The true y
y
Out[63]:
In [64]:
# Of course, the labels here need to be considered carefully!
k_means.predict(Xtrain)
Out[64]:
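Because cluster labels are an arbitrary permutation of the class labels, a permutation-invariant score is a safer summary than a raw confusion matrix. A minimal sketch using adjusted_rand_score:
In [ ]:
# Hedged sketch: the adjusted Rand index ignores the label permutation
# (1.0 = perfect agreement, ~0.0 = chance).
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(y, k_means.predict(Xtrain)))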
Back to the notes.
In [65]:
# Load in mayavi
import mayavi.mlab as mlab
In [66]:
# Load in the data
from sklearn import manifold, datasets
X, color = datasets.make_swiss_roll(n_samples=4000)
In [67]:
# Take a look at it
mlab.clf()
mlab.points3d(X[:, 0], X[:, 1], X[:, 2],mode='point')
mlab.axes()
mlab.show()
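If mayavi is not installed, matplotlib's 3-D scatter is a serviceable fallback; a sketch, assuming only matplotlib is available:
In [ ]:
# Hedged fallback: plot the swiss roll with matplotlib's 3-D projection.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection)
fig = py.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=py.cm.Spectral, s=2)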
In [68]:
# Compute a locally linear embedding (here via the 'ltsa' variant)
X_r, err = manifold.locally_linear_embedding(X, n_neighbors=10,
                                             n_components=2, method='ltsa')
In [69]:
# Look at the first few rows of the embedding
X_r[:10, :]
Out[69]:
In [70]:
# The embedding
py.plot(X_r[:,0],X_r[:,1],'b.')
Out[70]:
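make_swiss_roll also returned color, each sample's position along the roll; coloring the embedding by it shows whether the manifold actually unrolled:
In [ ]:
# A smooth color gradient across the embedding means the unrolling
# preserved the ordering along the original roll.
py.scatter(X_r[:, 0], X_r[:, 1], c=color, cmap=py.cm.Spectral, s=5)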
In [71]:
m = {'X1':'petal width','X2':'sepal width'}
In [72]:
m['X1']
Out[72]:
In [73]:
m = {}
for i in range(300):
    m['X%d' % i] = 'foo'
In [74]:
m['X1']
Out[74]:
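The same dictionary can be built in one idiomatic line with a comprehension:
In [ ]:
# Equivalent one-liner using a dict comprehension.
m = {'X%d' % i: 'foo' for i in range(300)}
m['X1']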
In [ ]: