First, we'll download the dataset to our local machine. The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k labelled examples and the test set about 19000. Given these sizes, it should be possible to train models quickly on any machine.
In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
The objective of this assignment is to learn about simple data curation practices, and familiarize you with some of the data we'll be reusing later.
This notebook uses the notMNIST dataset for Python experiments. This dataset is designed to look like the classic MNIST dataset, while looking a little more like real data: it's a harder task, and the data is a lot less 'clean' than MNIST.
In [2]:
# Base location of the notMNIST archives; the requested file name is appended to it.
url = 'http://yaroslavvb.com/upload/notMNIST/'


def maybe_download(filename, expected_bytes, force=False):
    """Fetch ``filename`` from ``url`` unless it is already on disk, then check its size.

    Args:
        filename: Name of the file, used both as the URL suffix and as the local path.
        expected_bytes: Exact size in bytes the local file must have.
        force: When true, download even if the file already exists.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    needs_fetch = force or not os.path.exists(filename)
    if needs_fetch:
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename = urlretrieve(url + filename, filename)[0]
    if os.stat(filename).st_size != expected_bytes:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    print('Found and verified', filename)
    return filename
# Download (or reuse) the two archives. The second argument is the exact byte size
# maybe_download requires the local file to have; a mismatch raises.
strRawCompressedTrainSetFilename = maybe_download('notMNIST_large.tar.gz', 247336696)
strRawCompressedTestSetFilename = maybe_download('notMNIST_small.tar.gz', 8458043)
Extract the dataset from the compressed .tar.gz file. This should give you a set of directories, labelled A through J.
In [3]:
s_iNum_classes = 10  # one folder / label per letter 'A'..'J'
np.random.seed(133)  # fixed seed so the later shuffles are reproducible


def maybe_extract(filename, force=False):
    """Unpack a ``.tar.gz`` archive next to itself and list its class folders.

    The extraction root is ``filename`` with both extensions removed. If that
    directory already exists the archive is not unpacked again unless
    ``force`` is true.

    Args:
        filename: Path to the ``.tar.gz`` archive.
        force: Re-extract even when the root directory is already present.

    Returns:
        Sorted list of the sub-directories of the root, each joined with the root.

    Raises:
        Exception: If the number of sub-directories differs from ``s_iNum_classes``.
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        sys.stdout.flush()  # make the message visible before the long-running extraction
        # Extract beside the archive so `root` is valid even when `filename`
        # carries a directory component (a bare file name still means the CWD).
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only use it on archives from a trusted source.
        with tarfile.open(filename) as tar:
            tar.extractall(os.path.dirname(root) or '.')
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != s_iNum_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                s_iNum_classes, len(data_folders)))
    print(data_folders)
    return data_folders
# Extract both archives; each call returns the list of per-class folder paths
# (maybe_extract also prints that list itself, after the headers printed here).
print("s_strListExtractedTrainFolderNames: ")
s_strListExtractedTrainFolderNames = maybe_extract(strRawCompressedTrainSetFilename)
print("\ns_strListExtractedTestFolderNames: ")
s_strListExtractedTestFolderNames = maybe_extract(strRawCompressedTestSetFilename)
In [ ]:
######################################## SKIP THIS CELL ############################################
# Optional visual check: display one sample PNG from the extracted 'A' folder inline.
# The path is hard-coded and relative, so it only resolves when notMNIST_large has been
# extracted into the current working directory.
from IPython.display import Image
Image(filename='./notMNIST_large/A/Z2xlZXN0ZWFrLnR0Zg==.png')
Now let's load the data in a more manageable format. Since, depending on your computer setup, you might not be able to fit it all in memory, we'll load each class into a separate dataset, store them on disk and curate them independently. Later we'll merge them into a single dataset of manageable size.
We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean and standard deviation ~0.5 to make training easier down the road.
A few images might not be readable; we'll just skip them.
In [4]:
s_iImage_size = 28  # Pixel width and height.
s_fPixel_depth = 255.0  # Number of levels per pixel.


def load_letter(folder, min_num_images):
    """Load the data for a single letter label, ensuring at least min_num_images were read.

    Every file in ``folder`` is read as an image, shifted by half the pixel depth and
    divided by the pixel depth, then stored in a float32 array.

    Args:
        folder: Directory holding the image files of one class.
        min_num_images: Minimum number of successfully loaded images required.

    Returns:
        float32 array of shape (num_loaded, s_iImage_size, s_iImage_size).

    Raises:
        Exception: If an image does not have the expected shape (this is NOT skipped,
            only IOError is), or if fewer than ``min_num_images`` images were loaded.
    """
    # List the folder once: the same listing sizes the buffer and drives the loop,
    # so the two can never disagree if the folder changes while loading.
    image_files = os.listdir(folder)
    # An ndarray is a multidimensional container of items of the same type and size;
    # here a 3d array indexed (image index, x, y) of float32. It is allocated for the
    # worst case (every file readable) and trimmed after the loop.
    dataset = np.ndarray(shape=(len(image_files), s_iImage_size, s_iImage_size), dtype=np.float32)
    image_index = 0
    # for each image in the current folder (A, B, etc)
    print(folder)
    for image in image_files:
        # get the full image path
        image_file = os.path.join(folder, image)
        try:
            # read the image as floats and normalize them using the pixel depth
            # NOTE(review): scipy.ndimage.imread was deprecated in SciPy 1.0 and, as far as
            # the SciPy release notes say, removed in 1.2 - confirm the pinned SciPy version.
            image_data = (ndimage.imread(image_file).astype(float) - s_fPixel_depth / 2) / s_fPixel_depth
            # ensure image shape is standard
            if image_data.shape != (s_iImage_size, s_iImage_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            # and put it in the dataset
            dataset[image_index, :, :] = image_data
            image_index += 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
    num_images = image_index
    # drop the unused (uninitialised) tail left by skipped files
    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))
    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
def maybe_pickle(p_strDataFolderNames, p_iMin_num_images_per_class, p_bForce=False):
    """Serialise each class folder to ``<folder>.pickle`` unless that file already exists.

    Args:
        p_strDataFolderNames: Class folders (A, B, ...) of either the train or the test set.
        p_iMin_num_images_per_class: Forwarded to ``load_letter`` as the minimum image count.
        p_bForce: Re-create the pickle even when it is already present.

    Returns:
        List of pickle file names, one per input folder, in input order. A name is
        listed even if writing that pickle failed (the failure is only printed).
    """
    dataset_names = []
    for strCurFolderName in p_strDataFolderNames:
        # pickling = serialising the loaded array of one sub-folder to disk
        strCurSetFilename = strCurFolderName + '.pickle'
        dataset_names.append(strCurSetFilename)
        if os.path.exists(strCurSetFilename) and not p_bForce:
            # You may override by setting p_bForce=True.
            print('%s already present - Skipping pickling.' % strCurSetFilename)
            continue
        print('Pickling %s.' % strCurSetFilename)
        dataset = load_letter(strCurFolderName, p_iMin_num_images_per_class)
        try:
            with open(strCurSetFilename, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            # Deliberately best effort: report and continue with the next class.
            # (The original handler named an undefined variable here and crashed
            # with NameError instead of printing the real error.)
            print('Unable to save data to', strCurSetFilename, ':', e)
    return dataset_names
# Pickle every class folder of both sets. The second argument is the minimum number of
# readable images required per class (passed on to load_letter, which raises below it).
s_strListPickledTrainFilenames = maybe_pickle(s_strListExtractedTrainFolderNames, 45000)
s_strListPickledTestFilenames = maybe_pickle(s_strListExtractedTestFolderNames, 1800)
print("\ns_strListPickledTrainFilenames: ", s_strListPickledTrainFilenames)
print("\ns_strListPickledTestFilenames: ", s_strListPickledTestFilenames)
In [ ]:
######################################## SKIP THIS CELL ############################################
# Sanity check: un-serialize the pickle of one randomly chosen training class and
# display one randomly chosen image from it.
random_class_id = np.random.randint(0, s_iNum_classes)
# Pickles are binary data: open in 'rb' mode (text mode fails on Python 3) and let the
# context manager close the handle.
with open(s_strListPickledTrainFilenames[random_class_id], 'rb') as f:
    unpickled_rnd_train_set = pickle.load(f)
# get the xy array representing a random image
random_img_id = np.random.randint(0, unpickled_rnd_train_set.shape[0])
first_img = unpickled_rnd_train_set[random_img_id, :, :]
# checking image shape, it is 28x28 pixels
# print("image %d from class %d with shape %d" %(random_img_id, random_class_id, first_img.shape))
print("image ", random_img_id, " from class ", random_class_id, " with shape ", first_img.shape)
# Denormalization is left commented out since it doesn't change anything for imshow. The way I
# understand this, each one of the 28x28 pixels only encodes grayscale, not rgb, and the imshow
# doc says that it can handle grayscale arrays that are normalized
# (http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow).
# s_fPixel_depth = 255.0 # Number of levels per pixel.
# first_img = first_img*s_fPixel_depth + s_fPixel_depth/2
# print(first_img[0,:])
plt.imshow(first_img)
plt.show()
In [ ]:
######################################## SKIP THIS CELL ############################################
# For every training class: count the images and average the per-image mean and the
# per-image variance over that class.
all_counts = np.zeros(s_iNum_classes)
all_means = np.zeros(s_iNum_classes)
all_variances = np.zeros(s_iNum_classes)
for cur_class_id in range(s_iNum_classes):
    # we unpickle here a 3d array with shape: image_ids, xs, ys
    # ('rb' because pickles are binary; the context manager closes the handle)
    with open(s_strListPickledTrainFilenames[cur_class_id], 'rb') as f:
        unpickled_cur_train_set = pickle.load(f)
    print("class ", cur_class_id)
    all_counts[cur_class_id] = len(unpickled_cur_train_set)
    # Reduce over the two pixel axes to get one statistic per image, then average over
    # the images of the class. The previous code summed the per-image statistics and
    # divided by the number of classes instead of the number of images.
    all_means[cur_class_id] = np.mean(np.mean(unpickled_cur_train_set, axis=(1, 2)))
    all_variances[cur_class_id] = np.mean(np.var(unpickled_cur_train_set, axis=(1, 2)))
print("all_counts: ", all_counts)
print("mean of all_means: ", all_means)
print("mean of all_variances: ", all_variances)
Merge and prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, and you can tune s_iTrainSize
as needed. The labels will be stored into a separate array of integers 0 through 9.
Also create a validation dataset for hyperparameter tuning.
In [5]:
# Allocate uninitialised storage for p_iNb_rows square images and their labels:
#   dataset: float32 3d array shaped [p_iNb_rows, p_iImg_size, p_iImg_size]
#   labels:  int32 vector shaped [p_iNb_rows]
# A falsy p_iNb_rows (e.g. 0) yields (None, None) instead of empty arrays.
def make_arrays(p_iNb_rows, p_iImg_size):
    if not p_iNb_rows:
        return None, None
    images = np.empty((p_iNb_rows, p_iImg_size, p_iImg_size), dtype=np.float32)
    targets = np.empty(p_iNb_rows, dtype=np.int32)
    return images, targets
# p_strListPickledFilenames is a list containing the filenames of the pickled per-class data
def merge_datasets(p_strListPickledFilenames, p_iTrainSize, p_iValidSize=0):
    """Merge per-class pickles into one training set and an optional validation set.

    Each class contributes ``p_iValidSize // num_classes`` validation images and
    ``p_iTrainSize // num_classes`` training images, taken from a shuffled copy of
    that class. Labels are the index of the class file in the input list.

    Args:
        p_strListPickledFilenames: Pickle files, one 3d array (image, x, y) per class.
        p_iTrainSize: Requested total number of training images.
        p_iValidSize: Requested total number of validation images (0 = none).

    Returns:
        (valid_dataset, valid_labels, train_dataset, train_labels). The validation
        entries are None when ``p_iValidSize`` is 0. When a requested size is not a
        multiple of the number of classes, the arrays hold only the rows that were
        actually filled (size rounded down) rather than uninitialised trailing rows.

    Raises:
        Exception: Any error while reading or slicing a class file is printed and re-raised.
    """
    iNum_classes = len(p_strListPickledFilenames)
    # make empty arrays for validation and training sets and labels
    valid_dataset, valid_labels = make_arrays(p_iValidSize, s_iImage_size)
    train_dataset, train_labels = make_arrays(p_iTrainSize, s_iImage_size)
    # number of items per class (floor division)
    iNbrOfValidItemsPerClass = p_iValidSize // iNum_classes
    iNbrOfTrainItemPerClass = p_iTrainSize // iNum_classes
    # running [start, end) windows into the output arrays
    iStartValidId, iStartTrainId = 0, 0
    iEndValidId, iEndTrainId = iNbrOfValidItemsPerClass, iNbrOfTrainItemPerClass
    # a class supplies validation items first, then training items up to this index
    iEndListId = iNbrOfValidItemsPerClass + iNbrOfTrainItemPerClass
    for iPickleFileId, strPickleFilename in enumerate(p_strListPickledFilenames):
        try:
            with open(strPickleFilename, 'rb') as f:
                print(strPickleFilename)
                # unpickle the 3d array for the current file
                threeDCurLetterSet = pickle.load(f)
            # Shuffle so validation and training items are a random split.
            # np.random.shuffle only shuffles along the first dimension.
            np.random.shuffle(threeDCurLetterSet)
            if valid_dataset is not None:
                # the first iNbrOfValidItemsPerClass items go to the validation set
                threeDValidItems = threeDCurLetterSet[:iNbrOfValidItemsPerClass, :, :]
                valid_dataset[iStartValidId:iEndValidId, :, :] = threeDValidItems
                # label all images with the current file id
                valid_labels[iStartValidId:iEndValidId] = iPickleFileId
                iStartValidId += iNbrOfValidItemsPerClass
                iEndValidId += iNbrOfValidItemsPerClass
            # the following items are used for the training set
            threeDTrainItems = threeDCurLetterSet[iNbrOfValidItemsPerClass:iEndListId, :, :]
            train_dataset[iStartTrainId:iEndTrainId, :, :] = threeDTrainItems
            train_labels[iStartTrainId:iEndTrainId] = iPickleFileId
            iStartTrainId += iNbrOfTrainItemPerClass
            iEndTrainId += iNbrOfTrainItemPerClass
        except Exception as e:
            print('Unable to process data from', strPickleFilename, ':', e)
            raise
    # The start indices now equal the number of rows written. Cut off whatever the
    # floor division left unfilled so callers never see uninitialised memory.
    if valid_dataset is not None:
        valid_dataset = valid_dataset[:iStartValidId]
        valid_labels = valid_labels[:iStartValidId]
    train_dataset = train_dataset[:iStartTrainId]
    train_labels = train_labels[:iStartTrainId]
    return valid_dataset, valid_labels, train_dataset, train_labels
#original values (kept for reference when the sizes below are tuned for memory)
# s_iTrainSize = 200000
# s_iValid_size = 10000
# s_iTestSize = 10000
# Requested total number of images per split; merge_datasets divides each by the
# number of class files to get the per-class quota.
s_iTrainSize = 200000
s_iValid_size = 10000
s_iTestSize = 10000
#call merge_datasets on data_sets and labels
# Train and validation images both come from the training pickles.
s_threeDValidDataset, s_vValidLabels, s_threeDTrainDataset, s_vTrainLabels = merge_datasets(s_strListPickledTrainFilenames, s_iTrainSize, s_iValid_size)
# No validation size is passed for the test pickles, so the first two results are None
# and discarded.
_, _, s_threeDTestDataset, s_vTestLabels = merge_datasets(s_strListPickledTestFilenames, s_iTestSize)
#print shapes for data sets and their respective labels. data sets are 3d arrays with [image_id,x,y] and labels
#are [image_ids]
print('Training:', s_threeDTrainDataset.shape, s_vTrainLabels.shape)
print('Validation:', s_threeDValidDataset.shape, s_vValidLabels.shape)
print('Testing:', s_threeDTestDataset.shape, s_vTestLabels.shape)
Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match.
In [6]:
def randomize(p_3dDataset, p_vLabels):
    """Reorder images and labels with one shared random permutation.

    The same index order is applied to both inputs, so every image keeps its label.
    New arrays are returned; the inputs are left untouched.
    """
    # np.random.permutation(n) draws a random ordering of np.arange(n)
    order = np.random.permutation(p_vLabels.shape[0])
    return p_3dDataset[order, :, :], p_vLabels[order]
# merge_datasets fills its outputs class by class, so the rows are still grouped by
# label at this point; shuffle each split (images and labels together).
s_threeDTrainDataset, s_vTrainLabels = randomize(s_threeDTrainDataset, s_vTrainLabels)
s_threeDTestDataset, s_vTestLabels = randomize(s_threeDTestDataset, s_vTestLabels)
s_threeDValidDataset, s_vValidLabels = randomize(s_threeDValidDataset, s_vValidLabels)
# shapes must be unchanged by the shuffle
print(s_threeDTrainDataset.shape)
print(s_threeDTestDataset.shape)
print(s_threeDValidDataset.shape)
In [ ]:
######################################## SKIP THIS CELL ############################################
# For each of the train, test and validation splits, average the per-image mean and the
# per-image variance over all images of the split.
# Slot layout of both accumulators: 0 = train, 1 = test, 2 = validation.
s_vAllShuffledMeans = np.zeros(3)
s_vAllShuffledVars = np.zeros(3)
for iSlot, strHeader, threeDSplit in (
        (0, "TRAIN mean: ", s_threeDTrainDataset),
        (1, "TEST mean: ", s_threeDTestDataset),
        (2, "VALID mean: ", s_threeDValidDataset)):
    iNbrOfImages = threeDSplit.shape[0]
    for iCurImageId in range(iNbrOfImages):
        # each image adds its share (1 / number of images) of the running average
        s_vAllShuffledMeans[iSlot] += np.mean(threeDSplit[iCurImageId]) / iNbrOfImages
        s_vAllShuffledVars[iSlot] += np.var(threeDSplit[iCurImageId]) / iNbrOfImages
    print(strHeader, s_vAllShuffledMeans[iSlot], "\t variance:", s_vAllShuffledVars[iSlot])
Finally, let's save the data for later reuse:
In [7]:
# Bundle the three shuffled splits and their labels into one dictionary and write it
# to a single pickle for reuse by later notebooks.
pickle_file = 'notMNIST.pickle'
try:
    save = {
        'train_dataset': s_threeDTrainDataset,
        'train_labels': s_vTrainLabels,
        'valid_dataset': s_threeDValidDataset,
        'valid_labels': s_vValidLabels,
        'test_dataset': s_threeDTestDataset,
        'test_labels': s_vTestLabels,
    }
    # The context manager closes the handle even when pickle.dump raises
    # (e.g. disk full); the explicit close() used before was skipped in that case.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise
In [8]:
# Report the on-disk size (bytes) of the file named by pickle_file.
# NOTE(review): the label says "Compressed", but the pickle.dump call that writes this
# file applies no compression - the number is the plain pickle size.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)
By construction, this dataset might contain a lot of overlapping samples, including training data that's also contained in the validation and test set! Overlap between training and test can skew the results if you expect to use your model in an environment where there is never an overlap, but is actually ok if you expect to see training samples recur when you use it. Measure how much overlap there is between training, validation and test samples.
Optional questions:
In [ ]:
######################################## SKIP THIS CELL ############################################
# Brute-force pairwise comparison, kept for reference only (far too slow at this size):
# all_doubles = np.zeros(2)
# for iCurTrainImageId in range(s_threeDTrainDataset.shape[0]):
# if iCurTrainImageId % 10 == 0:
# print (iCurTrainImageId)
# for iCurTestImageId in range(s_threeDTestDataset.shape[0]):
# if np.array_equal(s_threeDTrainDataset[iCurTrainImageId], s_threeDTestDataset[iCurTestImageId]):
# all_doubles[0] += 1
# for iCurValidImageId in range(s_threeDValidDataset.shape[0]):
# if np.array_equal(s_threeDTrainDataset[iCurTrainImageId], s_threeDValidDataset[iCurValidImageId]):
# all_doubles[1] += 1
# print(all_doubles[0])
# print(all_doubles[1])
#eythian solution, with my edits
# all_doubles[0]: training images whose hash was already seen earlier in the training set
# all_doubles[1]: test images whose hash occurs in the training set
all_doubles = np.zeros(2)
# Maps image hash -> index of the last training image seen with that hash.
dup_dict = {}
for idx, img in enumerate(s_threeDTrainDataset):
    # Hash a bytes copy of the pixels. hash(img.data) only works on Python 2 read-only
    # buffers; on Python 3 memoryview hashing is restricted to byte formats and raises
    # ValueError for float32 data. Hashing the bytes also removes the need to flip the
    # datasets to read-only. Equal pixel data always gives equal hashes within one run.
    h = hash(img.tobytes())
    # dict membership is a hash-table lookup, so this stays linear overall
    if h in dup_dict:
        #print ('Duplicate image: %d matches %d' % (idx, dup_dict[h]))
        all_doubles[0] += 1
    dup_dict[h] = idx
for idx, img in enumerate(s_threeDTestDataset):
    h = hash(img.tobytes())
    if h in dup_dict:
        #print ('Test image %d is in the training set' % idx)
        all_doubles[1] += 1
print(all_doubles[0])
print(all_doubles[1])
Let's get an idea of what an off-the-shelf classifier can give you on this data. It's always good to check that there is something to learn, and that it's a problem that is not so trivial that a canned solution solves it.
Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.
Optional question: train an off-the-shelf model on all the data!
In [9]:
### taking inspiration from http://scikit-learn.org/stable/auto_examples/calibration/plot_compare_calibration.html#example-calibration-plot-compare-calibration-py
# Fit a logistic regression on a small prefix of the training set and plot a histogram of
# the probabilities it predicts on a small prefix of the test set.
from sklearn import datasets
from sklearn.calibration import calibration_curve
train_samples = 100 # number of samples used for training
test_samples = 50 #number of samples for test
#training patterns. x is input pattern, y is target pattern or label
X_train = s_threeDTrainDataset[:train_samples]
#fit function below expects one flat feature vector per sample, so each (x, y) image
#is flattened into a single row of x*y values
X_train = X_train.reshape([X_train.shape[0],X_train.shape[1]*X_train.shape[2]])
y_train = s_vTrainLabels[:train_samples]
#test patterns, flattened the same way
X_test = s_threeDTestDataset[:test_samples]
X_test = X_test.reshape([X_test.shape[0],X_test.shape[1]*X_test.shape[2]])
y_test = s_vTestLabels[:test_samples]
# Create classifier
lr = LogisticRegression()
#create plots: ax1 spans the top two thirds of the figure, ax2 the bottom third
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))
# only the reference diagonal is drawn on ax1; the calibration curve itself is commented out below
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
#try to fit the training data
lr.fit(X_train, y_train)
#assess how confident (how probable it is correct) the model is at predicting test classifications
# NOTE(review): [:, 1] keeps a single column of predict_proba - presumably the probability
# of the second class in lr.classes_ only, not of the predicted class; confirm this is intended
# for the 10-class problem.
prob_pos = lr.predict_proba(X_test)[:, 1]
#fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos, n_bins=10)
#ax1.plot(mean_predicted_value, fraction_of_positives, "s-", label="%s" % (name, ))
ax2.hist(prob_pos, range=(0, 1), bins=10, label='Logistic', histtype="step", lw=2)
# ax1.set_ylabel("Fraction of positives")
# ax1.set_ylim([-0.05, 1.05])
# ax1.legend(loc="lower right")
# ax1.set_title('Calibration plots (reliability curve)')
ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()
plt.show()
In [19]:
########################### SKIP; ORIGINAL LOGISTIC CODE #################################
# Reference copy of the scikit-learn calibration example, reduced to logistic regression
# on a synthetic binary problem. Running it overwrites the notebook's X_train / X_test /
# y_train / y_test / lr variables and re-seeds numpy's global RNG.
print(__doc__)
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD Style.
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
#
X, y = datasets.make_classification(n_samples=100000, n_features=20, n_informative=2, n_redundant=2)
train_samples = 100  # Samples used for training the models
# the first train_samples rows train the model, everything after them is the test set
X_train, X_test = X[:train_samples], X[train_samples:]
y_train, y_test = y[:train_samples], y[train_samples:]
# Create classifiers
lr = LogisticRegression()
# gnb = GaussianNB()
# svc = LinearSVC(C=1.0)
# rfc = RandomForestClassifier(n_estimators=100)
###############################################################################
# Plot calibration plots
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
for clf, name in [(lr, 'Logistic')]:
    # (gnb, 'Naive Bayes'),
    # (svc, 'Support Vector Classification'),
    # (rfc, 'Random Forest')]:
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        prob_pos = clf.predict_proba(X_test)[:, 1]
    else:
        # no probabilities available: min-max scale the decision function into [0, 1]
        prob_pos = clf.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_test, prob_pos, n_bins=10)
    ax1.plot(mean_predicted_value, fraction_of_positives, "s-", label="%s" % (name, ))
    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name, histtype="step", lw=2)
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots (reliability curve)')
ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()
plt.show()
In [31]:
########################### SKIP; ORIGINAL LOGISTIC CODE FOR 10 CLASSES #################################
# Second working copy of the scikit-learn calibration example. As written it still
# generates the same synthetic data with make_classification's default class count;
# nothing here uses the notMNIST arrays. Running it overwrites X_train / X_test /
# y_train / y_test / lr and re-seeds numpy's global RNG.
print(__doc__)
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD Style.
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
X, y = datasets.make_classification(
    n_samples=100000, n_features=20, n_informative=2, n_redundant=2)
train_samples = 100  # Samples used for training the models
# split point: rows before it are for fitting, rows from it onwards for evaluation
X_train = X[:train_samples]
y_train = y[:train_samples]
X_test = X[train_samples:]
y_test = y[train_samples:]
# Create classifiers
lr = LogisticRegression()
# gnb = GaussianNB()
# svc = LinearSVC(C=1.0)
# rfc = RandomForestClassifier(n_estimators=100)
###############################################################################
# Plot calibration plots
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
for clf, name in [(lr, 'Logistic')]:
    # (gnb, 'Naive Bayes'),
    # (svc, 'Support Vector Classification'),
    # (rfc, 'Random Forest')]:
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        prob_pos = clf.predict_proba(X_test)[:, 1]
    else:  # use decision function, rescaled to the [0, 1] range
        prob_pos = clf.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_test, prob_pos, n_bins=10)
    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label="%s" % (name, ))
    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots (reliability curve)')
ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()
plt.show()
In [32]:
################# SOMEONE ELSE'S CODE ##############################
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
def forum(algo, ntrain, ntest):
    """Fit ``algo`` on a prefix of the training set and print its accuracy on a test prefix.

    Images are flattened to one row of width*height values per sample. The printed
    time covers both fitting and scoring.

    Args:
        algo: Estimator exposing ``fit(X, Y)`` and ``score(X, Y)``.
        ntrain: Number of leading training samples to use.
        ntest: Number of leading test samples to use.

    Returns:
        None; the result is only printed.
    """
    wh = s_threeDTrainDataset.shape[1] * s_threeDTrainDataset.shape[2]
    # reshape(-1, wh) lets numpy infer the row count, so asking for more samples than
    # exist uses what is available instead of failing with a reshape size mismatch
    X = s_threeDTrainDataset[:ntrain].reshape(-1, wh)
    Xtest = s_threeDTestDataset[:ntest].reshape(-1, wh)
    Y = s_vTrainLabels[:ntrain]
    Ytest = s_vTestLabels[:ntest]
    t0 = time.time()
    algo.fit(X, Y)
    # score is multiplied by 100 and printed as a percentage
    score = algo.score(Xtest, Ytest) * 100
    elapsed = time.time() - t0
    print('{} score: {:.1f}% under {:.2f}s'.format(type(algo), score, elapsed))
# Compare two off-the-shelf classifiers on 50000 training / 1000 test samples.
forum(KNeighborsClassifier(), ntrain=50000, ntest=1000)
# penalty='l1' is only supported by the liblinear and saga solvers. liblinear used to be
# the default, but newer scikit-learn releases default to lbfgs and reject this
# combination, so the solver is named explicitly.
forum(LogisticRegression(C=10.0, penalty='l1', multi_class='ovr', tol=0.01, solver='liblinear'), ntrain=50000, ntest=1000)
In [ ]: