In [138]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_mldata
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve
In [2]:
# Fetch the 'MNIST original' dataset and display the returned object.
# NOTE(review): fetch_mldata is no longer available in recent scikit-learn
# releases (fetch_openml is its replacement) — confirm the pinned version
# before re-running this notebook.
mnist = fetch_mldata('MNIST original')
mnist
Out[2]:
In [3]:
# Pull the feature matrix and the label vector out of the fetched dataset.
X = mnist['data']
y = mnist['target']
In [30]:
# Dimensions of the data: 784 is the number of feature columns and 70000 is
# the number of samples.
# The original images are 28x28 pixels, which becomes 784 values once each
# image is flattened into a vector (28 * 28 = 784).
X.shape
Out[30]:
In [31]:
# Shape of the label array (the labels that are assigned to each image).
y.shape
Out[31]:
In [72]:
# Peek at the raw data: take a single sample vector and fold it back into
# its original 28x28 pixel grid, then display the resulting array.
digit = X[16081]
digit_img = np.reshape(digit, (28, 28))
digit_img
Out[72]:
In [73]:
# Plot the sample image and display the corresponding label.
# The label is read at index 16081 — the same index the image vector was
# taken from — so the picture and the label describe the same sample.
# it's a 2 (:
plt.imshow(digit_img, cmap=matplotlib.cm.binary,
           interpolation='nearest')
plt.axis('off')
plt.show()
y[16081]
Out[73]:
In [60]:
# Split the MNIST data into train and test portions, for both features and
# labels: the first 60000 samples are for training, the rest for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
In [61]:
# Shuffle the training set so the algorithm gets the images in a random order.
# A seeded RandomState makes the permutation identical on every
# Restart & Run All (and leaves NumPy's global RNG untouched); the length is
# taken from the data rather than a hard-coded 60000.
shuffle_index = np.random.RandomState(42).permutation(len(X_train))
# Apply the same permutation to features and labels so they stay aligned.
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
In [62]:
# Display the permutation that was applied to the training set.
shuffle_index
Out[62]:
In [63]:
# Train on a subset of the problem: detecting a single digit, in this case the digit 2.
# First step is to create the target vectors (True where the label equals 2).
y_train_2 = (y_train == 2)
y_test_2 = (y_test == 2)
In [64]:
# Build an SGD-trained classifier with a fixed random_state and fit it on the
# "is this a 2?" targets.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X=X_train, y=y_train_2)
Out[64]:
In [74]:
# Ask the binary detector about the sample digit inspected earlier
# (predict expects a batch, hence the one-element list).
sgd_clf.predict(X=[digit])
Out[74]:
In [80]:
# 5-fold cross-validated accuracy of the binary "2 vs. not 2" detector.
# The boolean y_train_2 targets are used here so the score matches the
# classifier fitted above and the metrics computed in the following cells;
# the multiclass score on y_train is computed separately further down.
cross_val_score(sgd_clf, X_train, y_train_2, cv=5, scoring='accuracy')
Out[80]:
In [103]:
# Cross-validated (3-fold) predictions for the binary targets.
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train_2, cv=3)
In [104]:
# Confusion matrix of the binary detector.
# y_train_pred was produced from the boolean y_train_2 targets, so it has to
# be compared against y_train_2 — not the ten-class y_train labels.
confusion_matrix(y_train_2, y_train_pred)
Out[104]:
In [113]:
# Precision of the cross-validated binary predictions.
precision_score(y_true=y_train_2, y_pred=y_train_pred, average='binary')
Out[113]:
In [117]:
# Recall of the cross-validated binary predictions.
recall_score(y_true=y_train_2, y_pred=y_train_pred, average='binary')
Out[117]:
In [119]:
# F1 score of the cross-validated binary predictions.
f1_score(y_true=y_train_2, y_pred=y_train_pred)
Out[119]:
In [129]:
# Collect cross-validated decision-function scores instead of hard
# predictions, then derive precision/recall values over the thresholds.
y_scores = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_2,
    cv=3,
    method='decision_function',
)
precisions, recalls, thresholds = precision_recall_curve(y_train_2, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall against the decision threshold.

    Both curves are drawn on the current matplotlib axes. `precisions` and
    `recalls` are plotted without their last element so they pair up with
    `thresholds`. The function returns nothing and does not call
    `plt.show()`.
    """
    ax = plt.gca()
    ax.plot(thresholds, precisions[:-1], "b--", label='Precision')
    ax.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Percentage')
    ax.legend(loc='upper left')
    ax.set_ylim([0, 1])
In [130]:
# Render the precision/recall-vs-threshold figure for the binary detector.
plot_precision_recall_vs_threshold(
    precisions=precisions,
    recalls=recalls,
    thresholds=thresholds,
)
plt.show()
In [131]:
# Refit the same SGD classifier, this time on the full digit labels
# rather than the boolean "is it a 2?" targets.
sgd_clf.fit(X=X_train, y=y_train)
Out[131]:
In [132]:
# Predict the label of the sample digit with the refitted classifier.
sgd_clf.predict(X=[digit])
Out[132]:
In [133]:
# Decision-function scores the classifier assigns to the sample digit.
digit_scores = sgd_clf.decision_function(X=[digit])
digit_scores
Out[133]:
In [135]:
# Same exercise with a random forest: fit on the digit labels, then
# predict the sample digit.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X=X_train, y=y_train)
forest_clf.predict(X=[digit])
Out[135]:
In [136]:
# Per-class probabilities the forest assigns to the sample digit.
forest_clf.predict_proba(X=[digit])
Out[136]:
In [137]:
# 3-fold cross-validated accuracy of the SGD classifier on the digit labels.
cross_val_score(estimator=sgd_clf, X=X_train, y=y_train, cv=3, scoring='accuracy')
Out[137]:
In [139]:
# Standardize the training features (cast to float64 first) and repeat the
# 3-fold accuracy check on the scaled inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X=X_train.astype(np.float64))
cross_val_score(
    estimator=sgd_clf,
    X=X_train_scaled,
    y=y_train,
    cv=3,
    scoring='accuracy',
)
Out[139]:
in progress, page 96
In [ ]: