In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn as sk
%matplotlib inline
...compte le nombre d'instances de mots. La représentation dense nous montre le vecteur de critères correspondant à chaque phrase dans corpus. Sinon, la représentation est creuse.
Essayez avec le corpus entier pour voir ce que donne un document plus important.
À essayer et expliquer :
- corpus_encoded.shape
- corpus_encoded[0].todense()
- corpus_encoded[0].data et print(corpus_encoded[0])
- corpus_encoded.todense()[0]
- vectorizer.inverse_transform(corpus_encoded.todense()[0])
- vectorizer.transform('The dog runs quickly towards the cat.')
- vectorizer.transform(['The dog runs quickly towards the cat.'])
- vectorizer2 = CountVectorizer(binary=True)
In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Short corpus: the opening lines of Victor Hugo's "Les pauvres gens".
corpus = [
    "Il est nuit. La cabane est pauvre, mais bien close.",
    "Le logis est plein d'ombre et l'on sent quelque chose",
    "Qui rayonne à travers ce crépuscule obscur.",
    "Des filets de pêcheur sont accrochés au mur.",
    "Au fond, dans l'encoignure où quelque humble vaisselle",
    "Aux planches d'un bahut vaguement étincelle,",
    "On distingue un grand lit aux longs rideaux tombants.",
    "Tout près, un matelas s'étend sur de vieux bancs,",
    "Et cinq petits enfants, nid d'âmes, y sommeillent",
    "La haute cheminée où quelques flammes veillent",
    "Rougit le plafond sombre, et, le front sur le lit,",
    "Une femme à genoux prie, et songe, et pâlit.",
    "C'est la mère. Elle est seule. Et dehors, blanc d'écume,",
    "Au ciel, aux vents, aux rocs, à la nuit, à la brume,",
    "Le sinistre océan jette son noir sanglot.",
]

# Keep only the first two sentences so the matrix stays small enough to read.
corpus2 = corpus[:2]

# Learn the vocabulary, then encode each sentence as a bag-of-words count vector.
vectorizer = CountVectorizer()
corpus_encoded = vectorizer.fit_transform(corpus2)

# Dense view of the sparse count matrix, then the learned word -> column map.
print(corpus_encoded.todense())
print('----------------------------------------------------------------')
print(vectorizer.vocabulary_)
In [3]:
from sklearn.metrics.pairwise import euclidean_distances

# Distance between the two encoded sentences, then a sanity check:
# a sentence is at distance zero from itself.
first = corpus_encoded[0]
second = corpus_encoded[1]
print(euclidean_distances(first, second))
print(euclidean_distances(first, first))

# Encode a near-duplicate of the first sentence ("nuit" -> "jour") with the
# already-fitted vectorizer and see how far it lands from the original.
jour = vectorizer.transform(['Il est jour. La cabane est pauvre, mais bien close.'])
print(euclidean_distances(first, jour))
In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

# Show the two sentences being encoded.
for sentence in corpus2:
    print(sentence)

# TF-IDF weights versus raw counts for the same two sentences.
print(tfidf_vectorizer.fit_transform(corpus2).todense())
print('--------------------------------------------------------------')
print(vectorizer.fit_transform(corpus2).todense())
In [5]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash words into a fixed number of buckets (6) instead of learning a
# vocabulary; norm=None keeps the raw (signed) counts per bucket.
hash_vectorizer = HashingVectorizer(n_features=6, norm=None)

corpus1 = corpus[:1]
print(corpus1)

# Transform once and reuse the result (the original called transform()
# twice on the same input, doing the work twice).
hashed = hash_vectorizer.transform(corpus1)
print(hashed.todense())

# Non-zero entries of the sparse matrix: (row indices, column indices, values).
scipy.sparse.find(hashed)
Out[5]:
In [6]:
from sklearn import datasets

digits = datasets.load_digits()

# Inspect the first sample: its label, its 8x8 pixel matrix, and the flat
# 64-element feature vector a classifier would consume.
print('Digit:', digits.target[0])
print(digits.images[0])
print('Feature vector:\n', digits.images[0].reshape(-1, 64))

# Display the SAME sample we just printed. The original displayed
# digits.images[-1], which did not match the label and matrix printed above.
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
Ici nous prenons un exemple de sklearn. Nous chargeons toutes les images du dataset digits, nous construisons un classifieur SVM (à venir ; pour l'instant, c'est juste un classifieur), et nous apprenons à classifier les chiffres sur la première moitié des données. Puis nous testons notre classifieur sur la deuxième moitié, pour laquelle nous connaissons également le target (vérité terrain, ground truth).
In [12]:
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# License: BSD 3 clause

# Standard scientific Python imports
import matplotlib.pyplot as plt
import math

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics

# The digits dataset: 8x8 grayscale images; the true digit is in `target`.
digits = datasets.load_digits()

# Show the first 4 images with their ground-truth labels.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# To apply a classifier, flatten each 8x8 image into a 64-feature row:
# data has shape (n_samples, 64).
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier.
classifier = svm.SVC(gamma=0.001)

# Train on the first half, test on the rest. The test slice must start at
# num_training, not num_test: with an odd n_samples (1797 here), floor gives
# 898 and ceil gives 899, so the original `data[num_test:]` silently dropped
# sample index 898 from BOTH sets. Slicing from num_training yields exactly
# num_test held-out samples.
num_training = int(math.floor(n_samples / 2))
num_test = int(math.ceil(n_samples / 2))
classifier.fit(data[:num_training], digits.target[:num_training])

# Predict on the held-out second half and compare with the known labels.
expected = digits.target[num_training:]
predicted = classifier.predict(data[num_training:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

# Show 4 test images alongside the classifier's predictions.
images_and_predictions = list(zip(digits.images[num_training:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()
In [8]:
from skimage.filters import roberts, sobel, scharr, prewitt
from skimage.color import rgb2gray
from skimage.data import camera
import skimage.io as io

# Pick one input image:
# image = camera()
image = rgb2gray(io.imread('victor.jpg'))

# Two classic gradient-based edge detectors applied to the same image.
results = [
    (roberts(image), 'Roberts Edge Detection'),
    (sobel(image), 'Sobel Edge Detection'),
]

fig, axes = plt.subplots(ncols=2)
for ax, (edge_map, title) in zip(axes, results):
    ax.imshow(edge_map, cmap=plt.cm.gray)
    ax.set_title(title)
    ax.axis('off')

plt.tight_layout()
Et maintenant procédons à la détection de coins (corners).
In [9]:
from skimage.feature import corner_harris, corner_peaks
from skimage.color import rgb2gray
from skimage.exposure import equalize_hist


def show_corners(corners, image):
    """Overlay the detected corner points as red dots on the grayscale image."""
    fig = plt.figure()
    plt.gray()
    plt.imshow(image)
    rows, cols = zip(*corners)
    plt.plot(cols, rows, 'or')
    plt.xlim(0, image.shape[1])
    # Flip the y axis so row 0 sits at the top, matching imshow orientation.
    plt.ylim(image.shape[0], 0)
    fig.set_size_inches(np.array(fig.get_size_inches()) * 1.5)
    plt.show()


# Show the raw photo, then equalize + grayscale it and run Harris detection.
victor = io.imread('victor.jpg')
plt.imshow(victor)
victor = equalize_hist(rgb2gray(victor))
corners = corner_peaks(corner_harris(victor), min_distance=2)
show_corners(corners, victor)
In [10]:
import mahotas as mh
from mahotas.features import surf

# Load as grayscale; SURF operates on single-channel images.
image = mh.imread('victor.jpg', as_grey=True)

# Compute the descriptors once and reuse them (the original called
# surf.surf(image) twice, doubling the most expensive step of this cell).
descriptors = surf.surf(image)
print('The first SURF descriptor:\n{img}'.format(img=descriptors[0]))
print('Extracted {num} SURF descriptors'.format(num=len(descriptors)))
In [ ]:
In [11]:
from sklearn import preprocessing
X = np.random.rand(4,4) * 100
print(X)
print('\n')
print(preprocessing.scale(X))