In [3]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline
# Import a bunch of libraries.
import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
# Set the randomizer seed so results are the same each time.
np.random.seed(0)
In [4]:
# Load the digit data either from mldata.org or, once downloaded to data_home, from disk. The data is about 53MB, so this cell
# may take a while the first time you run it.
mnist = fetch_mldata('MNIST original', data_home='~/datasets/mnist')
X, Y = mnist.data, mnist.target
# Rescale grayscale values to [0,1].
X = X / 255.0
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]
print 'data shape: ', X.shape
print 'label shape:', Y.shape
# Set some variables to hold test, dev, and training data.
test_data, test_labels = X[61000:], Y[61000:]
dev_data, dev_labels = X[60000:61000], Y[60000:61000]
train_data, train_labels = X[:60000], Y[:60000]
mini_train_data, mini_train_labels = X[:1000], Y[:1000]
In [5]:
example = mini_train_data[:1]
In [93]:
# Credit where due... some inspiration drawn from:
# https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/fig/mnist.py
# example_as_pixel_matrix():
# transforms a 784-element pixel vector into a 28 x 28 pixel matrix
def example_as_pixel_matrix(example):
return np.reshape(example, (-1, 28))
# add_example_to_figure():
# given an existing figure, number of rows, columns, and position,
# adds a subplot with the example to the figure
def add_example_to_figure(example,
figure,
subplot_rows,
subplot_cols,
subplot_number):
matrix = example_as_pixel_matrix(example)
subplot = figure.add_subplot(subplot_rows, subplot_cols, subplot_number)
subplot.imshow(matrix, cmap='Greys', interpolation='nearest')
# disable tick marks
subplot.set_xticks(np.array([]))
subplot.set_yticks(np.array([]))
# plot_examples():
# given a matrix of examples (digit, example#) => example,
# plots it with digits as rows and examples as columns
def plot_examples(examples):
figure = plt.figure()
shape = np.shape(examples)
rows = shape[0]
columns = shape[1]
subplot_index = 1
for digit, examples_for_digit in enumerate(examples):
for example_index, example in enumerate(examples_for_digit):
add_example_to_figure(example,
figure,
rows,
columns,
subplot_index
)
subplot_index = subplot_index + 1
figure.tight_layout()
plt.show()
# plot_one_example():
# given an example, plots only that example, typically
# for debugging or diagnostics
def plot_one_example(example):
examples = [ [ example ] ]
plot_examples(examples)
# select_indices_of_digit():
# given an array of digit labels, selects the indices of
# labels that match a desired digit
def select_indices_of_digit(labels, digit):
return [i for i, label in enumerate(labels) if label == digit]
# take_n_from():
# code readability sugar for taking a number of elements from an array
def take_n_from(count, array):
return array[:count]
# take_n_examples_by_digit():
# given a data set of examples, a label set, and a parameter n,
# creates a matrix where the rows are the digits 0-9, and the
# columns are the first n examples of each digit
def take_n_examples_by_digit(data, labels, n):
examples = [
data[take_n_from(n, select_indices_of_digit(labels, digit))]
for digit in range(10)
]
return examples
In [7]:
indices_8 = [i for i, digit in enumerate(mini_train_labels) if digit == 8]
mini_train_labels[indices_8]
Out[7]:
In [9]:
take_n_from(10, select_indices_of_digit(mini_train_labels, 8))
Out[9]:
In [11]:
examples = take_n_examples_by_digit(mini_train_data, mini_train_labels, 10)
plot_examples(examples)
In [12]:
from sklearn.metrics import accuracy_score
# apply_k_nearest_neighbors():
# given the parameter k, training data and labels, and development data and labels,
# fit a k nearest neighbors classifier using the training data,
# test using development data, and output a report
def apply_k_nearest_neighbors(k,
training_data,
training_labels,
development_data,
development_labels):
neigh = KNeighborsClassifier(n_neighbors = k)
neigh.fit(training_data, training_labels)
predicted_labels = neigh.predict(development_data)
target_names = [ str(i) for i in range(10) ]
print '============ Classification report for k = ' + str(k) + ' ============'
print ''
print(classification_report(
development_labels,
predicted_labels,
target_names = target_names))
return accuracy_score(development_labels, predicted_labels, normalize = True)
In [13]:
def P2(k_values):
return [
apply_k_nearest_neighbors(k,
mini_train_data,
mini_train_labels,
dev_data,
dev_labels)
for k in k_values
]
k_values = [1, 3, 5, 7, 9]
P2(k_values)
Out[13]:
In [15]:
# k_nearest_neighbors_timed_accuracy():
# given the parameter k, training data and labels, and development data and labels,
# fit a k nearest neighbors classifier using the training data,
# test using development data, and return the number of examples, prediction time,
# and accuracy as a Python dictionary
def k_nearest_neighbors_timed_accuracy(k,
training_data,
training_labels,
development_data,
development_labels):
neigh = KNeighborsClassifier(n_neighbors = k)
neigh.fit(training_data, training_labels)
start = time.time()
predicted_labels = neigh.predict(development_data)
end = time.time()
examples, dimensions = np.shape(training_data)
accuracy = accuracy_score(development_labels, predicted_labels, normalize = True)
return { 'examples' : examples, 'time' : end-start, 'accuracy' : accuracy }
In [215]:
k_nearest_neighbors_timed_accuracy(1, mini_train_data, mini_train_labels, dev_data, dev_labels)
Out[215]:
In [16]:
def P3(train_sizes, accuracies):
k = 1
for train_size in train_sizes:
# take the first train_size examples of the (pre-shuffled) training set
current_train_data, current_train_labels = train_data[:train_size], train_labels[:train_size]
results = k_nearest_neighbors_timed_accuracy(k,
current_train_data,
current_train_labels,
dev_data,
dev_labels)
print(results)
accuracies.append(results['accuracy'])
In [17]:
train_sizes = [100, 200, 400, 800, 1600, 3200, 6400, 12800, 25000]
accuracies = [ ]
P3(train_sizes, accuracies)
In [18]:
from sklearn.linear_model import LogisticRegression
# fit_linear_regression():
# given arrays of training data sizes and corresponding accuracies,
# train and return a linear regression model for predicting accuracies
def fit_linear_regression(train_sizes, accuracies):
train_sizes_matrix = [ [ train_size ] for train_size in train_sizes ]
linear = LinearRegression()
linear.fit(train_sizes_matrix, accuracies)
return linear
# fit_logistic_regression():
# given arrays of training data sizes and corresponding accuracies,
# train and return a logistic regression model for predicting accuracies
def fit_logistic_regression(train_sizes, accuracies):
train_sizes_matrix = [ [ train_size ] for train_size in train_sizes ]
logistic = LogisticRegression()
logistic.fit(train_sizes_matrix, accuracies)
return logistic
In [19]:
def P4():
full_training_size = 60000
linear = fit_linear_regression(train_sizes, accuracies)
linear_prediction = linear.predict([ [ full_training_size ] ])
print('Linear model prediction for '
+ str(full_training_size) + ' : ' + str(linear_prediction[0]))
logistic = fit_logistic_regression(train_sizes, accuracies)
logistic_prediction = logistic.predict([ [ full_training_size ] ])
print('Logistic model prediction for '
+ str(full_training_size) + ' : ' + str(logistic_prediction[0]))
P4()
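In [ ]:
# Aside (a sketch, not part of the original results): LogisticRegression is a
# classifier, so fitting it to continuous accuracies treats each distinct
# accuracy value as its own class. For a target bounded in (0, 1), a common
# alternative is to fit plain linear regression to the log-odds of accuracy
# and map predictions back through the sigmoid.
def fit_log_odds_regression(train_sizes, accuracies):
    train_sizes_matrix = [ [ train_size ] for train_size in train_sizes ]
    # assumes every accuracy is strictly between 0 and 1
    log_odds = [ np.log(a / (1.0 - a)) for a in accuracies ]
    model = LinearRegression()
    model.fit(train_sizes_matrix, log_odds)
    return model
def predict_accuracy(model, train_size):
    # invert the log-odds transform
    log_odds = model.predict([ [ train_size ] ])[0]
    return 1.0 / (1.0 + np.exp(-log_odds))
# e.g. predict_accuracy(fit_log_odds_regression(train_sizes, accuracies), 60000)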
In [20]:
# train_k_nearest_neighbors():
# given the parameter k and training data and labels,
# fit and return a k nearest neighbors classifier
def train_k_nearest_neighbors(k,
training_data,
training_labels):
neigh = KNeighborsClassifier(n_neighbors = k)
neigh.fit(training_data, training_labels)
return neigh
# most_confused():
# given a confusion matrix,
# returns a tuple of the two most confused digits and the error count between them
def most_confused(confusion):
rows, columns = np.shape(confusion)
worst_row, worst_column, worst_errors = 0, 1, 0
# iterate through the upper triangle, ignoring the diagonal;
# the error count for a pair is the sum of its two off-diagonal entries
for row in range(rows):
for column in range(row + 1, columns):
errors = confusion[row][column] + confusion[column][row]
if errors > worst_errors:
worst_row, worst_column, worst_errors = row, column, errors
return ( worst_row, worst_column, worst_errors )
# select_pairwise_error_indices():
# given a predictions vector, actual label vector, and the two digits of interest,
# returns an array of indices where those two digits were confused
def select_pairwise_error_indices(predictions, labels, confused_low, confused_high):
error_indices = [ ]
for i, prediction in enumerate(predictions):
label = labels[i]
if ((prediction == confused_low and label == confused_high) or
(prediction == confused_high and label == confused_low)):
error_indices.append(i)
return error_indices
In [21]:
def P5():
k = 1
neigh = train_k_nearest_neighbors(k, train_data, train_labels)
development_predicted = neigh.predict(dev_data)
confusion = confusion_matrix(dev_labels, development_predicted)
confused_low, confused_high, confusion_errors = most_confused(confusion)
print('Most confused digits are: ' + str(confused_low) + ' and ' + str(confused_high)
+ ', with ' + str(confusion_errors) + ' total confusion errors')
error_indices = select_pairwise_error_indices(
development_predicted, dev_labels, confused_low, confused_high)
error_examples = [ dev_data[error_indices] ]
plot_examples(error_examples)
return confusion
P5()
Out[21]:
In [22]:
neigh = train_k_nearest_neighbors(1, train_data, train_labels)
In [23]:
development_predicted = neigh.predict(dev_data)
In [24]:
confusion = confusion_matrix(dev_labels, development_predicted)
In [25]:
confused_low, confused_high, confusion_errors = most_confused(confusion)
print('Most confused digits are: ' + str(confused_low) + ' and ' + str(confused_high)
+ ', with ' + str(confusion_errors) + ' total confusion errors')
In [26]:
error_indices = select_pairwise_error_indices(
development_predicted, dev_labels, confused_low, confused_high)
error_examples = [ dev_data[error_indices] ]
plot_examples(error_examples)
In [27]:
np.shape(example)
Out[27]:
In [28]:
error_example = dev_data[561:562]
np.shape(error_example)
plot_one_example(error_example)
In [31]:
import itertools
def blur(image):
pixel_matrix = example_as_pixel_matrix(image)
blurred_image = []
rows, columns = np.shape(pixel_matrix)
for row in range(rows):
for column in range(columns):
# take the mean of the 3 x 3 pixel neighborhood (the generator expression),
# guarding against running off the edges of the matrix (the if clause)
value = np.mean(list(
pixel_matrix[i][j]
for i, j
in itertools.product(
range(row - 1, row + 2),
range(column - 1, column + 2)
)
if (i >= 0) and (j >= 0) and (i < rows) and (j < columns)
))
blurred_image.append(value)
return blurred_image
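In [ ]:
# A vectorized alternative to blur() above (a sketch, assuming scipy is
# available). Edge handling differs slightly: blur() averages only the
# in-bounds neighbors, while uniform_filter with mode='constant' pads the
# border with zeros before averaging all 9 cells.
from scipy.ndimage import uniform_filter
def blur_fast(image):
    pixel_matrix = example_as_pixel_matrix(image)
    return uniform_filter(pixel_matrix, size = 3, mode = 'constant').flatten()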
In [34]:
example = mini_train_data[:1]
plot_one_example(example)
plot_one_example(blur(example))
In [35]:
blurred_mini_train_data = [ blur(datum) for datum in mini_train_data ]
In [37]:
np.shape(mini_train_data)
Out[37]:
In [38]:
np.shape(blurred_mini_train_data)
Out[38]:
In [39]:
np.shape(train_data)
Out[39]:
In [32]:
blurred_training_data = [ blur(example) for example in train_data ]
np.shape(blurred_training_data)
In [314]:
blurred_example = blur(example)
plot_one_example(example)
plot_one_example(blurred_example)
In [308]:
np.shape(example_as_pixel_matrix(blurred_example))
Out[308]:
In [317]:
plot_examples(take_n_examples_by_digit(train_data, train_labels, 10))
In [1]:
blurred_training_data = [ blur(example) for example in train_data ]
plot_examples(take_n_examples_by_digit(blurred_training_data, train_labels, 10))
In [5]:
# binarize_example():
# Turn all pixels at or below 0.5 (or threshold) -> 0, greater -> 1
def binarize_example(example, threshold = 0.5):
binarized = [ 1 if value > threshold else 0 for value in example ]
return binarized
# binarize_examples():
# Apply binarization to a set of examples
def binarize_examples(examples, threshold = 0.5):
binarized = [ binarize_example(example, threshold) for example in examples ]
return binarized
# ternarize_example():
# Turn all pixels below 1/3 (threshold_low) -> 0, below 2/3 (threshold_high) -> 1, else -> 2
def ternarize_example(example, threshold_low = 0.33333333, threshold_high = 0.66666666):
ternarized = [
0 if value < threshold_low else 1 if value < threshold_high else 2
for value in example
]
return ternarized
# ternarize_examples():
# Apply ternarization to a set of examples
def ternarize_examples(examples, threshold_low = 0.33333333, threshold_high = 0.66666666):
ternarized = [
ternarize_example(example, threshold_low, threshold_high)
for example in examples
]
return ternarized
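In [ ]:
# The list comprehensions above are clear but slow on a 60000 x 784 array;
# a vectorized numpy sketch of the same thresholding:
def binarize_examples_fast(examples, threshold = 0.5):
    return (np.asarray(examples) > threshold).astype(int)
def ternarize_examples_fast(examples, threshold_low = 1.0 / 3, threshold_high = 2.0 / 3):
    X = np.asarray(examples)
    # 0 below threshold_low, 1 between the thresholds, 2 at or above threshold_high
    return (X >= threshold_low).astype(int) + (X >= threshold_high).astype(int)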
In [10]:
binarized_train_data = binarize_examples(train_data)
In [53]:
ternarized_train_data = ternarize_examples(train_data)
In [55]:
binary_naive_bayes = BernoulliNB()
binary_naive_bayes.fit(binarized_train_data, train_labels)
Out[55]:
In [64]:
ternary_naive_bayes = MultinomialNB()
ternary_naive_bayes.fit(ternarized_train_data, train_labels)
Out[64]:
In [56]:
binarized_dev_data = binarize_examples(dev_data)
In [63]:
ternarized_dev_data = ternarize_examples(dev_data)
In [57]:
binary_naive_bayes_predicted = binary_naive_bayes.predict(binarized_dev_data)
In [65]:
ternary_naive_bayes_predicted = ternary_naive_bayes.predict(ternarized_dev_data)
In [61]:
target_names = [ str(i) for i in range(10) ]
print '============ Classification report for binarized ============'
print ''
print(classification_report(
dev_labels,
binary_naive_bayes_predicted,
target_names = target_names))
print ' Accuracy score: '
print(accuracy_score(dev_labels, binary_naive_bayes_predicted, normalize = True))
In [66]:
print '============ Classification report for ternarized ============'
print ''
print(classification_report(
dev_labels,
ternary_naive_bayes_predicted,
target_names = target_names))
print ' Accuracy score: '
print(accuracy_score(dev_labels, ternary_naive_bayes_predicted, normalize = True))
In [4]:
def P7():
# incomplete stub: the alpha grid search below was run cell-by-cell instead
binarized_train_data = binarize_examples(train_data)
ternarized_train_data = ternarize_examples(train_data)
In [6]:
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
In [7]:
bernoulli_naive_bayes = BernoulliNB()
In [9]:
grid_search = GridSearchCV(bernoulli_naive_bayes, alphas, verbose = 3)
In [11]:
grid_search.fit(binarized_train_data, train_labels)
Out[11]:
In [13]:
grid_search.best_params_
Out[13]:
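In [ ]:
# To see how accuracy varies across the whole alpha grid (not just the
# winner), the old sklearn.grid_search API exposes per-parameter scores via
# grid_scores_ (later sklearn versions replaced this with cv_results_):
for score in grid_search.grid_scores_:
    print score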
In [21]:
from sklearn.metrics import accuracy_score
In [72]:
def train_and_score_gaussian(
training_data, training_labels, development_data, development_labels):
model = GaussianNB().fit(training_data, training_labels)
predictions = model.predict(development_data)
print(accuracy_score(development_labels, predictions, normalize = True))
return model
print 'Accuracy score of Gaussian Naive Bayes (uncorrected): '
gaussian_naive_bayes = train_and_score_gaussian(train_data, train_labels, dev_data, dev_labels)
In [71]:
theta = gaussian_naive_bayes.theta_
for digit in range(5):
theta_figure = plt.figure()
theta_hist = plt.hist(theta[digit], bins = 100)
theta_hist_title = plt.title('Theta distribution for the digit ' + str(digit))
plt.show()
In [62]:
sigma = gaussian_naive_bayes.sigma_
for digit in range(5):
sigma_figure = plt.figure()
sigma_hist = plt.hist(sigma[digit], bins = 100)
sigma_hist_title = plt.title('Sigma distribution for the digit ' + str(digit))
plt.show()
In [63]:
sigma = gaussian_naive_bayes.sigma_
for digit in range(5,10):
sigma_figure = plt.figure()
sigma_hist = plt.hist(sigma[digit], bins = 100)
sigma_hist_title = plt.title('Sigma distribution for the digit ' + str(digit))
plt.show()
In [83]:
print('Issue: many features have variance 0, ')
print('so their Gaussian likelihoods degenerate and contribute only noise')
examples, pixels = np.shape(train_data)
def select_signal_pixel_indices(data):
indices = [ ]
examples, pixels = np.shape(data)
for pixel in range(pixels):
has_signal = False
for example in range(examples):
if data[example][pixel] > 0.0:
has_signal = True
if has_signal:
indices.append(pixel)
return indices
pixels_with_signal = select_signal_pixel_indices(train_data)
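In [ ]:
# A one-line vectorized equivalent of select_signal_pixel_indices (a sketch):
# a pixel carries signal iff its maximum over the training set is > 0.
pixels_with_signal_fast = np.where(np.max(train_data, axis = 0) > 0.0)[0]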
In [85]:
def select_pixels_with_signal(data, pixels_with_signal):
examples, pixels = np.shape(data)
selected = [
data[example][pixels_with_signal]
for example in range(examples)
]
return selected
signal_train_data = select_pixels_with_signal(train_data, pixels_with_signal)
signal_dev_data = select_pixels_with_signal(dev_data, pixels_with_signal)
print('Attempt #0 : only select non-0 pixels ')
train_and_score_gaussian(signal_train_data, train_labels, signal_dev_data, dev_labels)
Out[85]:
In [66]:
examples, pixels = np.shape(mini_train_data)
all_pixels = np.reshape(mini_train_data, examples * pixels)
all_pixels_hist = plt.hist(all_pixels, bins = 100)
all_pixels_hist_title = plt.title('All pixels histogram (not REMOTELY Normal)')
plt.show()
print('Cannot apply a straight log since log(0) is undefined; what about log(value + 0.1)?')
all_pixels_mapped = [ np.log(0.1 + pixel) for pixel in all_pixels ]
all_pixels_hist = plt.hist(all_pixels_mapped, bins = 100)
all_pixels_hist_title = plt.title('All pixels mapped log(0.1 + value) histogram')
plt.show()
In [75]:
def transform_attempt1(pixel):
return np.log(0.1 + pixel)
vectorized_transform_attempt1 = np.vectorize(transform_attempt1)
mapped_train_data = vectorized_transform_attempt1(train_data)
mapped_dev_data = vectorized_transform_attempt1(dev_data)
print('Attempt #1 : transform each pixel with log(0.1 + pixel) ')
train_and_score_gaussian(mapped_train_data, train_labels, mapped_dev_data, dev_labels)
Out[75]:
In [76]:
def transform_attempt2(pixel):
return 0.0 if pixel < 0.0001 else 1.0
vectorized_transform_attempt2 = np.vectorize(transform_attempt2)
mapped_train_data = vectorized_transform_attempt2(train_data)
mapped_dev_data = vectorized_transform_attempt2(dev_data)
print('Attempt #2 : binarize all pixels with a very low threshold ')
train_and_score_gaussian(mapped_train_data, train_labels, mapped_dev_data, dev_labels)
Out[76]:
In [88]:
binarized_train_data = binarize_examples(train_data)
binary_naive_bayes = BernoulliNB().fit(binarized_train_data, train_labels)
binary_naive_bayes.get_params()
Out[88]:
In [97]:
binary_naive_bayes.feature_log_prob_[0]
def generate_example(log_probabilities):
pixels = [
1.0 if np.random.rand() <= np.exp( log_probability ) else 0.0
for log_probability in log_probabilities
]
return pixels
# more than 10 x 10 gets scaled too small
def plot_10_examples(binary_naive_bayes):
per_digit_log_probabilities = binary_naive_bayes.feature_log_prob_
examples = [
[
generate_example(per_digit_log_probabilities[digit])
for example in range(10)
]
for digit in range(10)
]
plot_examples(examples)
def P10(num_examples):
binarized_train_data = binarize_examples(train_data)
binary_naive_bayes = BernoulliNB().fit(binarized_train_data, train_labels)
page = 0
while page < num_examples:
plot_10_examples(binary_naive_bayes)
page = page + 10
P10(20)
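In [ ]:
# generate_example() draws one Bernoulli per pixel in a Python loop; the
# same draw can be done in a single vectorized call (a sketch):
def generate_example_fast(log_probabilities):
    p = np.exp(log_probabilities)
    return (np.random.rand(len(p)) <= p).astype(float)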
In [98]:
binarized_train_data = binarize_examples(train_data)
binary_naive_bayes = BernoulliNB(alpha = 0.0001).fit(binarized_train_data, train_labels)
In [100]:
binary_naive_bayes.feature_log_prob_
Out[100]:
In [101]:
np.shape(binary_naive_bayes.feature_log_prob_)
Out[101]:
In [106]:
binary_naive_bayes.class_log_prior_
Out[106]:
In [123]:
dev_example = binarized_dev_data[:1] # assumed: dev_example was not defined in the notebook as captured
binary_naive_bayes.predict_proba(dev_example)
Out[123]:
In [129]:
prediction = int(binary_naive_bayes.predict(dev_example)[0])
In [135]:
posterior = binary_naive_bayes.predict_proba(dev_example)[0][prediction]
posterior
Out[135]:
In [144]:
label = dev_labels[0]
for i in range(len(buckets)):
if posterior <= buckets[i]:
total[i] = total[i] + 1
if prediction == label:
correct[i] = correct[i] + 1
break
In [126]:
plot_one_example(dev_example)
In [26]:
buckets = [0.5, 0.9, 0.999, 0.99999, 0.9999999, 0.999999999, 0.99999999999, 0.9999999999999, 1.0]
In [27]:
correct = [0 for i in buckets]
In [28]:
total = [0 for i in buckets]
In [34]:
buckets = [0.5, 0.9, 0.999, 0.99999, 0.9999999, 0.999999999, 0.99999999999, 0.9999999999999, 1.0]
correct = [0 for i in buckets]
total = [0 for i in buckets]
def train_binarized_bernoulli(training_data, training_labels, alpha = 0.0001):
binarized_train_data = binarize_examples(training_data)
binary_naive_bayes = BernoulliNB(alpha = alpha)
binary_naive_bayes.fit(binarized_train_data, training_labels)
return binary_naive_bayes
def find_bucket_index(buckets, posterior):
index = None
for i in range(len(buckets)):
if index is None and posterior <= buckets[i]:
index = i
return index
def score_by_posterior_buckets(
binary_naive_bayes, test_data, test_labels,
buckets, correct, total):
predictions = binary_naive_bayes.predict(test_data)
posteriors = binary_naive_bayes.predict_proba(test_data)
confidences = [
posteriors[index][int(predictions[index])]
for index in range(len(predictions))
]
for index, confidence in enumerate(confidences):
bucket_index = find_bucket_index(buckets, confidence)
total[bucket_index] = total[bucket_index] + 1
if predictions[index] == test_labels[index]:
correct[bucket_index] = correct[bucket_index] + 1
def P11(buckets, correct, total):
binary_naive_bayes = train_binarized_bernoulli(
train_data, train_labels)
binarized_dev_data = binarize_examples(dev_data)
score_by_posterior_buckets(binary_naive_bayes, binarized_dev_data, dev_labels,
buckets, correct, total)
P11(buckets, correct, total)
for i in range(len(buckets)):
accuracy = float(correct[i]) / total[i] if total[i] > 0 else 0.0
print 'p(pred) <= %.13f total = %3d accuracy = %.3f' % (buckets[i], total[i], accuracy)
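In [ ]:
# find_bucket_index() above scans linearly; since buckets is sorted in
# ascending order, np.searchsorted finds the same "first bucket >= posterior"
# index in one call (a sketch; it returns len(buckets) rather than None when
# the posterior exceeds every bucket):
def find_bucket_index_fast(buckets, posterior):
    return np.searchsorted(buckets, posterior, side = 'left')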
In [158]:
total
Out[158]:
In [188]:
correct
Out[188]:
In [185]:
binary_naive_bayes.predict_proba(dev_data[0])[0]
Out[185]:
In [198]:
binarized_dev_data = binarize_examples(dev_data)
In [199]:
predictions = binary_naive_bayes.predict(binarized_dev_data)
In [200]:
posteriors = binary_naive_bayes.predict_proba(binarized_dev_data)
In [6]:
binarized_train_data = binarize_examples(train_data)
In [7]:
binary_naive_bayes = BernoulliNB()
binary_naive_bayes.fit(binarized_train_data, train_labels)
Out[7]:
In [8]:
binarized_dev_data = binarize_examples(dev_data)
In [9]:
predictions = binary_naive_bayes.predict(binarized_dev_data)
In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(dev_labels, predictions, normalize = True)
Out[12]:
In [13]:
posteriors = binary_naive_bayes.predict_proba(binarized_dev_data)
In [24]:
confidences = [
posteriors[index][int(predictions[index])]
for index in range(len(predictions))
]
In [32]:
for index, confidence in enumerate(confidences):
label = dev_labels[index]
bucket_index = find_bucket_index(buckets, confidence)
total[bucket_index] = total[bucket_index] + 1
if predictions[index] == label:
correct[bucket_index] = correct[bucket_index] + 1