Scikit-learn comes with a few standard datasets, for instance the iris and digits datasets for classification and the boston house prices dataset for regression.
In [2]:
# http://scikit-learn.org/stable/tutorial/basic/tutorial.html
from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()
A dataset is a dictionary-like object that holds all the data and some metadata about the data. This data is stored in the .data member, which is an (n_samples, n_features) array. In the case of a supervised problem, one or more response variables are stored in the .target member. More details on the different datasets can be found in the dedicated section of the scikit-learn documentation.
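As a quick check (a small sketch using the datasets loaded above), the shapes confirm the (n_samples, n_features) layout:
In [ ]:
print(digits.data.shape)    # (1797, 64): 1797 samples, 64 pixel features (8x8 images)
print(digits.target.shape)  # (1797,): one class label per sample
print(iris.data.shape)      # (150, 4): 150 iris samples, 4 features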
For instance, in the case of the digits dataset, digits.data
gives access to the features that can be used to classify the digits samples:
In [3]:
print(digits.data)
and digits.target gives the ground truth for the digits dataset, that is, the number corresponding to each digit image that we are trying to learn:
In [5]:
digits.target
Out[5]:
In [6]:
digits.images[0]
Out[6]:
In [104]:
print(__doc__)
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# License: BSD 3 clause
In [105]:
# Standard scientific Python imports
import matplotlib.pyplot as plt
# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
In [106]:
# The digits dataset
digits = datasets.load_digits()
In [108]:
# The data that we are interested in is made of 8x8 images of digits, let's
# have a look at the first 4 images, stored in the `images` attribute of the
# dataset. If we were working from image files, we could load them using
# matplotlib.pyplot.imread. Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)  # subplot(nrows, ncols, plot_number)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
In [109]:
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
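As a sanity check (a small sketch, assuming the data array built above), the flattened images coincide with the precomputed digits.data matrix:
In [ ]:
import numpy as np
print(data.shape)                         # (1797, 64)
print(np.array_equal(data, digits.data))  # True: digits.data already holds the flattened images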
An example of an estimator is the class sklearn.svm.SVC that implements support vector classification. The constructor of an estimator takes as arguments the parameters of the model, but for the time being, we will consider the estimator as a black box:
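For instance (a minimal illustrative sketch, not part of the example above), the constructor arguments are stored on the estimator and can be inspected with get_params():
In [ ]:
from sklearn import svm
example_clf = svm.SVC(gamma=0.001)        # example_clf is just an illustrative name
print(example_clf.get_params()['gamma'])  # 0.001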
In [110]:
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
In scikit-learn, an estimator for classification is a Python object that implements the methods fit(X, y) and predict(T).
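To make that interface concrete, here is a minimal sketch of a hypothetical estimator (not part of scikit-learn) that simply predicts the most frequent class seen during fit:
In [ ]:
import numpy as np

class MajorityClassifier:
    """Toy estimator exposing only fit(X, y) and predict(T)."""
    def fit(self, X, y):
        # remember the most frequent label in the training data
        values, counts = np.unique(y, return_counts=True)
        self.majority_ = values[np.argmax(counts)]
        return self

    def predict(self, T):
        # predict that label for every sample in T
        return np.full(len(T), self.majority_)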
In [111]:
# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])
Out[111]:
In [112]:
# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])
In [113]:
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(expected, predicted)))
In [114]:
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
In [115]:
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
In [116]:
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
In [117]:
plt.show()
In [164]:
# Trying to display more (the first 32):
# The digits dataset
digits = datasets.load_digits()
# The data that we are interested in is made of 8x8 images of digits. Let's
# have a look at the first 32 images, stored in the `images` attribute of the
# dataset. If we were working from image files, we could load them using
# matplotlib.pyplot.imread. Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:32]):
    plt.subplot(8, 4, index + 1)  # subplot(nrows, ncols, plot_number)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])
# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:32]):
    plt.subplot(8, 4, index + 1)  # plot each prediction in its own subplot
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
In [166]:
# Trying to display more (the last 32):
# The digits dataset
digits = datasets.load_digits()
# The data that we are interested in is made of 8x8 images of digits. Let's
# have a look at the last 32 images, stored in the `images` attribute of the
# dataset. If we were working from image files, we could load them using
# matplotlib.pyplot.imread. Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[-32:]):
    plt.subplot(8, 4, index + 1)  # subplot(nrows, ncols, plot_number)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])
# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[-32:]):
    plt.subplot(8, 4, index + 1)  # plot each prediction in its own subplot
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
In [1]:
# Here the same dataset, but with a neural network algorithm:
# https://src-code.simons-rock.edu/git/MATH_CMPT_370_S17/Multilayer_Neural_Network/raw/master/NN.py
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import neural_network
from sklearn import metrics
#inspiration:
# (1) https://www.datacamp.com/community/tutorials/machine-learning-python#gs.geiQ7Ic
# (2) http://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html
# We want to load the dataset of digits
digits = datasets.load_digits()
# We want to separate the images from the rest of the info
images = digits.images
# separate the data of the images (basically the images vectorized)
X = digits.data
# Separate the target values from the rest of the info
y = digits.target
# partition the data into training data and test data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y)
# scale the data because the multilayer perceptron may not converge otherwise
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
#scaler = preprocessing.StandardScaler()
#scaler.fit(X_train)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)
# The neural network "architecture" is created here. We tell it to create 2 hidden layers
# with 20 nodes each
mlp = neural_network.MLPClassifier(hidden_layer_sizes = (20,20))
# now we send our training data in to have the weights and thresholds trained
mlp.fit(X_train, y_train)
# These are our predictions based on the trained network
predictions = mlp.predict(X_test)
# the confusion matrix counts how many of the predictions correspond
# to each true label in the test set
print(metrics.confusion_matrix(y_test, predictions))
# precision, recall and F1-score are reported to assess
# the quality of our predictions for each class
print(metrics.classification_report(y_test, predictions))
# Now we want to visualize what is going on... hence why I went with image data
# Figure size (width, height) in inches
fig = plt.figure(figsize=(6, 6))
# Adjust the subplots
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# For just 64 images; note there are more than 64 images in the test set (450).
for i in range(64):
    # Initialize the subplots: add a subplot in the grid of 8 by 8, at the i+1-th position
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    # Display an image at the i-th position
    ax.imshow(X_test[i].reshape(8, 8), cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(y_test[i]))
# Show the plot
plt.show()
In [168]:
# Testing only the last digit:
# The digits dataset
digits = datasets.load_digits()
# The data that we are interested in is made of 8x8 images of digits. Let's
# have a look at the last image, stored in the `images` attribute of the
# dataset. If we were working from image files, we could load them using
# matplotlib.pyplot.imread. Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[-1:]):
    plt.subplot(1, 1, 1)  # subplot(nrows, ncols, plot_number)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# We learn the digits on all images except the last one
classifier.fit(digits.data[:-1], digits.target[:-1])
# Now predict the value of the last digit:
expected = digits.target[-1:]
predicted = classifier.predict(digits.data[-1:])
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
images_and_predictions = list(zip(digits.images[-1:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[-1:]):
    plt.subplot(1, 1, 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
In [3]:
# http://scikit-learn.org/stable/tutorial/basic/tutorial.html#conventions
In [4]:
import numpy as np
from sklearn import random_projection
In [5]:
rng = np.random.RandomState(0)
X = rng.rand(10, 2000)
X = np.array(X, dtype='float32')
X.dtype
Out[5]:
In [6]:
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
X_new.dtype
Out[6]:
In [28]:
#https://docs.scipy.org/doc/numpy/user/basics.types.html
import numpy as np
np.float16(1/3) # 0.3333 (float16 has less precision)
Out[28]:
In [29]:
np.float32(1/3) # 0.33333334
Out[29]:
In [30]:
np.float64(np.float32(1/3)) # 0.3333333432674408
Out[30]:
In [31]:
np.float64(1/3) # 0.33333333333333331
Out[31]:
In this example, X is float32, which is cast to float64 by fit_transform(X). Regression targets are cast to float64; classification targets are maintained:
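A small sketch of the regression side (using LinearRegression, which is not otherwise part of this notebook): integer targets are cast to float64, so the predictions come back as float64:
In [ ]:
import numpy as np
from sklearn.linear_model import LinearRegression

X_reg = np.arange(10, dtype='float32').reshape(-1, 1)
y_reg = np.arange(10)                    # integer regression targets
reg = LinearRegression().fit(X_reg, y_reg)
print(reg.predict(X_reg[:3]).dtype)      # float64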
In [32]:
from sklearn import datasets
from sklearn.svm import SVC
iris = datasets.load_iris()
clf = SVC()
clf.fit(iris.data, iris.target)
Out[32]:
In [33]:
list(clf.predict(iris.data[:3]))
Out[33]:
In [34]:
clf.fit(iris.data, iris.target_names[iris.target])
Out[34]:
In [35]:
list(clf.predict(iris.data[:3]))
Out[35]:
Here, the first predict() returns an integer array, since iris.target (an integer array) was used in fit. The second predict() returns a string array, since iris.target_names was used for fitting.
Hyper-parameters of an estimator can be updated after it has been constructed via the set_params() method.
Calling fit() more than once will overwrite what was learned by any previous fit():
In [42]:
import numpy as np
from sklearn.svm import SVC
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = rng.binomial(1, 0.5, 100)
X_test = rng.rand(5, 10)
In [43]:
clf = SVC()
clf.set_params(kernel='linear').fit(X, y)
Out[43]:
In [44]:
clf.predict(X_test)
Out[44]:
In [45]:
clf.set_params(kernel='rbf').fit(X, y)
Out[45]:
In [46]:
clf.predict(X_test)
Out[46]:
In [83]:
# https://github.com/gmonce/scikit-learn-book/blob/master/Chapter%201%20%20-%20A%20Gentle%20Introduction%20to%20Machine%20Learning.ipynb
# Thanks, Raúl Garreta and Guillermo Moncecchi!
%pylab inline
import IPython
import sklearn as sk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

print('IPython version:', IPython.__version__)
print('numpy version:', np.__version__)
print('scikit-learn version:', sk.__version__)
print('matplotlib version:', matplotlib.__version__)
In [71]:
from sklearn import datasets
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target
print(X_iris.shape, y_iris.shape)
print(X_iris[0], y_iris[0])
The dataset has 150 instances, each with 4 attributes (features). Now split the dataset: 75% for training the classifier and 25% for evaluation, keeping only two features (sepal length and width).
We will also perform feature scaling: for each feature, calculate the average, subtract the mean value from the feature value, and divide the result by the standard deviation. After scaling, each feature will have zero mean and a standard deviation of one. This standardization of values (which does not change their distribution, as you can verify by plotting the X values before and after scaling) is a common requirement of machine learning methods; it prevents features with large values from weighting too heavily in the final results.
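A quick numeric check of that claim (a small sketch with made-up numbers, using the same StandardScaler as below): after standardization every column has mean approximately 0 and standard deviation 1:
In [ ]:
import numpy as np
from sklearn.preprocessing import StandardScaler

A = np.array([[1.0, 200.0],
              [2.0, 400.0],
              [3.0, 600.0]])
A_scaled = StandardScaler().fit_transform(A)
print(A_scaled.mean(axis=0))  # approximately [0. 0.]
print(A_scaled.std(axis=0))   # [1. 1.]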
In [75]:
from sklearn.model_selection import train_test_split  # the book imports this from sklearn.cross_validation, now deprecated
from sklearn.preprocessing import StandardScaler
# Get dataset with only the first two attributes
X, y = X_iris[:,:2], y_iris
# Split the dataset into a training and a testing set
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
# random_state : int or RandomState. Pseudo-random number generator state used for random sampling.
# If random_state is None or np.random, then a randomly-initialized RandomState object is returned.
# If random_state is an integer, then it is used to seed a new RandomState object.
# If random_state is a RandomState object, then it is passed through.
print(X_train.shape, y_train.shape)
# Standardize the features
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
Let's plot the training data using pyplot
In [77]:
colors = ['red', 'greenyellow', 'blue']
for i in range(len(colors)):
    px = X_train[:, 0][y_train == i]
    py = X_train[:, 1][y_train == i]
    plt.scatter(px, py, c=colors[i])
plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
Out[77]:
Linear classification with SGDClassifier from scikit-learn.
SGD = Stochastic Gradient Descent, a very popular numerical procedure for finding the local minimum of a function (in this case, the loss function, which measures how far each instance is from our decision boundary). The algorithm learns the hyperplane coefficients by minimizing the loss function. Let's fit a linear classification model to our training data and display the fitted hyperplane:
In [79]:
# create the linear model classifier
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
# fit (train) the classifier
clf.fit(X_train, y_train)
# print learned coefficients
print(clf.coef_)
print(clf.intercept_)
Plot the three computed decision boundaries. Class 0 is linearly separable from the rest; classes 1 and 2 are not.
In [82]:
x_min, x_max = X_train[:, 0].min() - .5, X_train[:, 0].max() + .5
y_min, y_max = X_train[:, 1].min() - .5, X_train[:, 1].max() + .5
xs = np.arange(x_min,x_max,0.5)
fig, axes = plt.subplots(1,3)
fig.set_size_inches(10,6)
for i in [0, 1, 2]:
    axes[i].set_aspect('equal')
    axes[i].set_title('Class ' + str(i) + ' versus the rest')
    axes[i].set_xlabel('Sepal length')
    axes[i].set_ylabel('Sepal width')
    axes[i].set_xlim(x_min, x_max)
    axes[i].set_ylim(y_min, y_max)
    plt.sca(axes[i])
    for j in range(len(colors)):
        px = X_train[:, 0][y_train == j]
        py = X_train[:, 1][y_train == j]
        plt.scatter(px, py, c=colors[j])
    ys = (-clf.intercept_[i] - xs * clf.coef_[i, 0]) / clf.coef_[i, 1]
    plt.plot(xs, ys)  # the 'hold' keyword was removed from recent matplotlib
Let's now see how the classifier predicts the class of particular instances, given the sepal length and width:
In [85]:
print(clf.predict(scaler.transform([[4.7, 3.1]])))
print(clf.decision_function(scaler.transform([[4.7, 3.1]])))
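Those decision_function values can be reproduced directly from the learned hyperplane coefficients; a small sketch (assuming the clf and scaler fitted above):
In [ ]:
x_new = scaler.transform([[4.7, 3.1]])
# one score per class: w . x + b for each one-vs-rest hyperplane
print(x_new @ clf.coef_.T + clf.intercept_)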
And now, how good is the classifier on the training data, measured by accuracy:
In [87]:
from sklearn import metrics
y_train_pred = clf.predict(X_train)
print(metrics.accuracy_score(y_train, y_train_pred))
To get a better idea of the expected performance of our classifier on unseen data, we must measure its accuracy on the testing set:
In [90]:
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
Let's try some additional measures: Precision, Recall and F-score, and show the confusion matrix
In [91]:
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))
print(metrics.confusion_matrix(y_test, y_pred))
Now, let's try cross-validation: divide the dataset into n parts, train on n-1 and evaluate on the remaining one; do this n times and take the mean. For this we will create a new classifier: a pipeline of the standardizer and the linear model. Then measure the cross-validation accuracy for each fold:
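For intuition, here is a hand-rolled version of the same procedure (a sketch only; cross_val_score in the next cell does this for us): split the data into 5 folds, fit on 4 of them, score on the held-out fold, and average:
In [ ]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

kf = KFold(n_splits=5, shuffle=True, random_state=33)
fold_scores = []
for train_idx, test_idx in kf.split(X):
    fold_model = Pipeline([('scaler', StandardScaler()),
                           ('linear_model', SGDClassifier())])
    fold_model.fit(X[train_idx], y[train_idx])
    fold_scores.append(fold_model.score(X[test_idx], y[test_idx]))
print(np.mean(fold_scores))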
In [97]:
from sklearn.model_selection import cross_val_score, KFold  # sklearn.cross_validation is deprecated
from sklearn.pipeline import Pipeline
# create a composite estimator made by a pipeline of the standardization and the linear model
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_model', SGDClassifier())
])
# create a k-fold cross-validation iterator of k = 5 folds
cv = KFold(n_splits=5, shuffle=True, random_state=33)
# by default the score used is the one returned by the score method of the estimator (accuracy)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)
Calculate the mean and standard error of cross-validation accuracy
In [98]:
from scipy.stats import sem
def mean_score(scores):
    """Return the empirical mean score and the standard error of the mean."""
    return ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))
print(mean_score(scores))
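scipy.stats.sem is simply the sample standard deviation divided by the square root of the number of folds; a quick sketch to verify (assuming the scores array above):
In [ ]:
print(sem(scores))
print(np.std(scores, ddof=1) / np.sqrt(len(scores)))  # same value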
Predict the Iris flower species using only two attributes: sepal width and sepal length. This is an instance of a classification problem, where we want to assign a label (a value taken from a discrete set) to an item according to its features.
In [65]:
# from sklearn.cross_validation import train_test_split (this is what the book uses, but it has since been deprecated)
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
In [66]:
# Get dataset with only the first two attributes
X, y = X_iris[:, :2], y_iris
In [62]:
# Split the dataset into a training and a testing set
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
In [63]:
print(X_train.shape, y_train.shape)
In [64]:
# Standardize the features
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)