In [32]:
%pylab inline
get some sample data
In [33]:
from sklearn import datasets
iris = datasets.load_iris()
what's available in the data?
In [34]:
# the pre-loaded datasets are "Bunches", which are dictionary-like (but the algorithms will always take arrays)
print "data dimensions (array): \n", iris.data.shape
print
print "bunch keys: \n", iris.keys()
print
print "feature names: \n", iris.feature_names
print
#print iris.DESCR
inspect the data
In [35]:
idx = 6
print "example iris features (4 features per sample): \n", iris.data[:idx]
print "\nexample iris labels: \n", iris.target[:idx]
In [36]:
import matplotlib.pyplot as plt
figsize(12,4)
subplot(131)
scatter(iris.data[:, 0:1], iris.data[:, 1:2])
xlabel("sepal length (cm)")
ylabel("sepal width (cm)")
axis("tight")
subplot(132)
scatter(iris.data[:, 1:2], iris.data[:, 2:3])
xlabel("sepal width (cm)")
ylabel("petal length (cm)")
axis("tight")
subplot(133)
scatter(iris.data[:, 0:1], iris.data[:, 2:3])
xlabel("sepal length (cm)")
ylabel("petal length (cm)")
axis("tight")
Out[36]:
In [37]:
# let's look at three of the features at once
from mpl_toolkits.mplot3d import Axes3D
figsize(10,6)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(15, 60) # (elev, azim) : adjust these to change viewing angle!
z = iris.data[:, 2:3]
x = iris.data[:, 1:2]
y = iris.data[:, 0:1]
# let's "cheat" and color the data according to the labels that are already given to us
color = iris.target
ax.scatter(x, y, z, c=color)
xlabel("sepal length (cm)")
ylabel("sepal width (cm)")
ax.set_zlabel("petal length (cm)")
axis("tight")
Out[37]:
In [38]:
import numpy as np
import random
get the feature (data) and label (target) arrays set up for use in the estimator, then split into train and test data, roughly 80:20 here (120 train, 30 test)
In [39]:
iris_X = iris.data
iris_y = iris.target
# shuffle the sample indices (seeded with a randomly drawn value, so each run differs)
r = random.randint(0, 100)
np.random.seed(r)
idx = np.random.permutation(len(iris_X))
subset = 30
iris_X_train = iris_X[idx[:-subset]] # all but last 'subset' rows
iris_y_train = iris_y[idx[:-subset]]
iris_X_test = iris_X[idx[-subset:]] # last 'subset' rows
iris_y_test = iris_y[idx[-subset:]]
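as an aside, sklearn ships a helper that does this shuffle-and-split in one call. a sketch, not used below; depending on your sklearn version it lives in sklearn.model_selection or the older sklearn.cross_validation:
In [ ]:
# same idea as the manual split above: hold out 30 samples for testing
# (assumes a sklearn recent enough to have model_selection)
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(iris_X, iris_y, test_size=30)
X_tr.shape, X_te.shape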
In [40]:
figsize(5,5)
scatter(iris_X_train[:, 0:1]
, iris_X_train[:, 1:2]
, c="blue"
, s=30
, label="train data"
)
scatter(iris_X_test[:, 0:1]
, iris_X_test[:, 1:2]
, c="red"
, s=30
, label="test data"
)
xlabel("sepal length (cm)")
ylabel("sepal width (cm)")
legend()
axis("tight")
Out[40]:
create a nearest-neighbor classification estimator & fit it to the training data
In [41]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
# fit the model using the training data
knn.fit(iris_X_train, iris_y_train)
Out[41]:
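a side note: KNeighborsClassifier uses n_neighbors=5 by default. a quick sketch of trying a different k (not used in the rest of the walkthrough):
In [ ]:
# same estimator, but explicitly setting the number of neighbors
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(iris_X_train, iris_y_train)
knn3.score(iris_X_test, iris_y_test)  # mean accuracy on the held-out samples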
use the trained knn estimator to predict the classification of the test data
In [42]:
# predict the labels for the test data, using the trained model
iris_y_predict = knn.predict(iris_X_test)
# show the results (labels)
iris_y_predict
Out[42]:
and in this case, we can look at the actual "correct" answers... the labels that came with the data
In [43]:
iris_y_test
Out[43]:
sklearn also has many built-in ways to gauge the "accuracy" of your trained estimator. the simplest is just "what fraction of our classifications did we get right?"
In [44]:
from sklearn.metrics import accuracy_score
accuracy_score(iris_y_test, iris_y_predict)
Out[44]:
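a couple of other standard checks from sklearn.metrics, sketched here for completeness: a confusion matrix (which class gets mistaken for which) and a per-class precision/recall report.
In [ ]:
from sklearn.metrics import confusion_matrix, classification_report
# rows = true class, columns = predicted class
print confusion_matrix(iris_y_test, iris_y_predict)
print
print classification_report(iris_y_test, iris_y_predict)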
how do the estimator's predictions differ from the real labels?
In [45]:
figsize(12,6)
subplot(221)
scatter(iris_X_test[:, 0:1] # real data
, iris_X_test[:, 1:2]
, c=iris_y_test # real labels
, s=100
, alpha=0.6
)
ylabel("sepal width (cm)")
title("real labels")
subplot(223)
scatter(iris_X_test[:, 0:1]
, iris_X_test[:, 2:3]
, c=iris_y_test
, s=100
, alpha=0.6
)
xlabel("sepal length (cm)")
ylabel("petal length (cm)")
subplot(222)
scatter(iris_X_test[:, 0:1] # real data
, iris_X_test[:, 1:2]
, c=iris_y_predict # predicted labels
, s=100
, alpha=0.6
)
ylabel("sepal width (cm)")
title("predicted labels")
subplot(224)
scatter(iris_X_test[:, 0:1]
, iris_X_test[:, 2:3]
, c=iris_y_predict
, s=100
, alpha=0.6
)
xlabel("sepal length (cm)")
ylabel("petal length (cm)")
Out[45]:
In [46]:
boston = datasets.load_boston()
boston.keys()
Out[46]:
In [47]:
# get the full info
#print boston.DESCR
do some inspection, because in general this is an important step...
In [48]:
print "dimensions: \n", boston.data.shape, boston.target.shape
print
print "features (defs in DESCR): \n", boston.feature_names
print
print "first row: \n", boston.data[:1]
let's say we want to draw some conclusions about the relationship between the number of rooms and the housing prices. find the right column using the description of the dataset, and have a look.
In [49]:
rooms = boston.data[:, 5]
figsize(5,5)
scatter(rooms, boston.target, alpha=0.5)
xlabel("room count")
ylabel("cost ($1000s)")
axis("tight")
Out[49]:
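to put a rough number on that visual impression, here's a quick check (not part of the original walkthrough) using numpy's corrcoef:
In [ ]:
# Pearson correlation between room count and price
# (np.corrcoef returns the 2x2 correlation matrix; take the off-diagonal entry)
np.corrcoef(rooms, boston.target)[0, 1]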
ok, so we can work with this - there's definitely some correlation between these two variables. let's imagine that we just knew we wanted to fit this immediately, without all the inspection. furthermore, we want to build a model, fit the estimator, and then keep it around for use in a later analysis! enter the time machine and see how we start...
In [50]:
# here comes the data
boston = datasets.load_boston()
# slice out the room-count column, keeping a 2-D (n_samples, 1) shape for the estimator
b_X = boston.data[:, 5:6]
b_y = boston.target
# split it out into train/test
r = random.randint(0,100)
np.random.seed(r)
idx = np.random.permutation(len(b_X))
subset = 50
b_X_train = b_X[idx[:-subset]] # all but last 'subset' rows
b_y_train = b_y[idx[:-subset]]
b_X_test = b_X[idx[-subset:]] # last 'subset' rows
b_y_test = b_y[idx[-subset:]]
In [51]:
# get our estimator & fit to the training data
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
print lr.fit(b_X_train, b_y_train)
print
print "Coefficient: \n", lr.coef_
now for the magic...
In [52]:
import pickle
p = pickle.dumps(lr)
p # looks super useful, right?
Out[52]:
In [53]:
# write this fitted estimator (python object) to a byte stream
with open('./lin-reg.pkl', 'wb') as f:
    f.write(p)
In [54]:
# now, imagine you've previously created this file and stored it off somewhere...
with open('./lin-reg.pkl', 'rb') as f:
    new_lr = pickle.loads( f.read() )
print new_lr
print
print "Coefficient (compare to previous output): \n", new_lr.coef_
now use this model to predict on some new data!
In [55]:
b_y_predict = new_lr.predict(b_X_test)
#b_y_predict # you can have a look at the result if you want
and have a look at how the model does. first, look at the fit through all of the data, then compare the predictions on the test data to the actual values
In [56]:
figsize(12,5)
subplot(121)
scatter(b_X, b_y, c="red")
plot(b_X, new_lr.predict(b_X), '-k')
axis('tight')
xlabel('room count')
ylabel('predicted price ($1000s)')
title("fit to all data")
subplot(122)
scatter(b_y_test, b_y_predict, c="green")
plot([0, 50], [0, 50], '--k')
axis('tight')
xlabel('true price ($1000s)')
ylabel('predicted price ($1000s)')
title("true- and predicted-value comparison (test data)")
Out[56]:
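to complement the plots with numbers, two standard regression metrics from sklearn.metrics (sketched here; not in the original walkthrough):
In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
print "mean squared error (test data): \n", mean_squared_error(b_y_test, b_y_predict)
print
print "R^2 (test data): \n", r2_score(b_y_test, b_y_predict)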
so, generally, the approach:
from sklearn.some_module import SomeEstimator
e = SomeEstimator()
e.fit(train_samples [, train_labels])   # labels only for supervised estimators
e.predict(test_samples)
e.score(test_samples, test_labels)      # optional: how well did we do?
rejoice()
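to make that skeleton concrete, here's the same pattern with one arbitrary estimator choice (SVC, a support vector classifier) on the iris split from earlier - a sketch; any sklearn estimator would slot in the same way:
In [ ]:
from sklearn.svm import SVC
e = SVC()
e.fit(iris_X_train, iris_y_train)      # learn from the training samples + labels
predictions = e.predict(iris_X_test)   # classify the held-out samples
e.score(iris_X_test, iris_y_test)      # fraction classified correctly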
In [ ]: