In [1]:
from sklearn.datasets import load_digits
data_set = load_digits()
Let's poke around and see what is in the data set.
In [2]:
data_set.keys()
Out[2]:
In [3]:
data_set.data
Out[3]:
Well, that is a bit hard to grok. Let's see if we can get a better view.
In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (4.0, 4.0)
def show_image(image_data):
    plt.imshow(image_data, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.show()
show_image(data_set.images[0])
data_set.images[0]
Out[4]:
Now we have an idea of what our data looks like: each sample started as an 8x8 grayscale image, and its rows were simply concatenated into a flat 64-element vector.
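If that's right, reshaping a flattened row back to 8x8 should reproduce the image array exactly. A quick sanity check (a sketch using numpy, which scikit-learn is built on):
In [ ]:
import numpy as np
# If each row really is a flattened image, the reshape should round-trip exactly.
print(np.array_equal(data_set.data[0].reshape(8, 8), data_set.images[0]))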
In [5]:
data_set.target
Out[5]:
And each one of these data points has a label, 0 through 9.
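Out of curiosity, we can also check how balanced the classes are. A quick sketch using numpy's bincount:
In [ ]:
import numpy as np
# Number of examples per digit, indices 0 through 9.
print(np.bincount(data_set.target))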
In [6]:
half_length = len(data_set.data) // 2
train_set = {
    'data': data_set.data[:half_length],
    'target': data_set.target[:half_length],
}
test_set = {
    'data': data_set.data[half_length:],
    'target': data_set.target[half_length:],
}
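A straight front/back split like this works for a quick demo, but if the data were sorted by label it would be a disaster. scikit-learn ships a helper that shuffles before splitting; a sketch (the import path is sklearn.model_selection in recent releases, sklearn.cross_validation in older ones):
In [ ]:
from sklearn.model_selection import train_test_split
# Shuffled 50/50 split; random_state makes it reproducible.
train_data, test_data, train_target, test_target = train_test_split(
    data_set.data, data_set.target, test_size=0.5, random_state=0)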
In [7]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(train_set['data'], train_set['target'])
Out[7]:
We now have a classifier. Let's try it out.
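Before eyeballing individual predictions, a one-liner gives us the overall accuracy on the held-out half (score on scikit-learn classifiers returns mean accuracy):
In [ ]:
# Mean accuracy on the held-out half of the data.
print(classifier.score(test_set['data'], test_set['target']))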
In [15]:
import random
plt.rcParams['figure.figsize'] = (4.0, 4.0)
predictions = classifier.predict(test_set['data'])
the_pick = random.randrange(0, len(test_set['data']))
show_image(test_set['data'][the_pick].reshape(8, 8))
print("We predict: {0}".format(predictions[the_pick]))
In [16]:
import pandas as pd
pd.crosstab(test_set['target'], predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
Out[16]:
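The same table is available straight from scikit-learn, just without the nice row and column labels. A sketch using metrics.confusion_matrix:
In [ ]:
from sklearn import metrics
# Rows are actual digits, columns are predicted digits.
print(metrics.confusion_matrix(test_set['target'], predictions))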
In [17]:
from sklearn import metrics
print("Classification report")
print(metrics.classification_report(test_set['target'], predictions))
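For any single class, precision is tp / (tp + fp) and recall is tp / (tp + fn). A hand-rolled check for the digit 0 (a sketch; it should match the report above up to rounding):
In [ ]:
import numpy as np
actual = test_set['target'] == 0
predicted = predictions == 0
tp = float(np.sum(actual & predicted))  # true positives
print(tp / np.sum(predicted))           # precision: tp / (tp + fp)
print(tp / np.sum(actual))              # recall: tp / (tp + fn)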
In [77]:
plt.rcParams['figure.figsize'] = (12.0, 12.0)
precision = []
recall = []
probabilities = classifier.predict_proba(test_set['data'])
for i in range(10):
    actual = [v == i for v in test_set['target']]
    p, r, _ = metrics.precision_recall_curve(actual, probabilities[:, i])
    precision.append(p)
    recall.append(r)
for i in range(10):
    plt.plot(recall[i], precision[i], label=i)
plt.legend(loc='lower left')
plt.xlim([0.0, 1.0])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
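Each curve can be boiled down to a single number, the area under it. A sketch using metrics.average_precision_score on the same per-class setup:
In [ ]:
for i in range(10):
    actual = [v == i for v in test_set['target']]
    ap = metrics.average_precision_score(actual, probabilities[:, i])
    print("digit {0}: average precision {1:.3f}".format(i, ap))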
In [18]:
import numpy as np
# Totally random data, nothing can be learned here.
rows = 1000
features = 64
data = np.random.random((rows, features))
labels = np.random.randint(0, 2, rows)
In [19]:
from sklearn.ensemble import RandomForestClassifier
# Train it on all the data, such a bad idea!
bad_classifier = RandomForestClassifier()
bad_classifier.fit(data, labels)
Out[19]:
In [20]:
probabilities = bad_classifier.predict_proba(data)
p, r, _ = metrics.precision_recall_curve(labels, probabilities[:, 1])
plt.plot(r, p)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
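That curve looks fantastic only because the forest memorized its training set. Scoring it on the very data it was fit on makes the memorization obvious (a quick check; score returns mean accuracy):
In [ ]:
# Near-perfect accuracy on memorized data, despite the labels being random noise.
print(bad_classifier.score(data, labels))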
In [21]:
# Divide our data in half
half_length = len(data) // 2
train_data = data[:half_length]
train_labels = labels[:half_length]
test_data = data[half_length:]
test_labels = labels[half_length:]
# train on half of it
good_classifier = RandomForestClassifier()
good_classifier.fit(train_data, train_labels)
# evaluate
probabilities = good_classifier.predict_proba(test_data)
p, r, _ = metrics.precision_recall_curve(test_labels, probabilities[:, 1])
plt.plot(r, p)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
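One step further: rather than trusting a single split, cross-validation repeats this train/evaluate cycle over several folds and averages the scores. A sketch using cross_val_score (import path assumed to be sklearn.model_selection):
In [ ]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation: each fold is held out once while the rest trains.
scores = cross_val_score(RandomForestClassifier(), data, labels, cv=5)
print(scores.mean())
On this random data the scores should hover around 0.5, which is exactly chance level.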