Class 4: Classification and Regression
In this class session we will look at visualizations for three kinds of neural networks: binary classification, multiclass classification, and regression.
These are exactly the same feature vector encoding functions from Class 3. They must be defined for this class as well; for more information, refer to Class 3.
In [5]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Encode text values to dummy variables (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode text values to indexes (i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric column as z-scores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name] - mean) / sd

# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.int32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)
This class introduces three visualizations: the confusion matrix and the ROC curve for classification neural networks, and the lift chart for regression neural networks. The code used to produce these visualizations is shown here:
In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Plot an ROC. pred - the predictions, y - the expected output.
def plot_roc(pred, y):
    fpr, tpr, _ = roc_curve(y, pred)  # use the y passed in, not a global
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

# Plot a lift chart. pred - the predictions, y - the expected output.
def chart_regression(pred, y):
    t = pd.DataFrame({'pred': pred.flatten(), 'y': y.flatten()})  # use the y passed in
    t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()
Binary classification is used to create a model that classifies between only two classes. These two classes are often called "positive" and "negative". Consider the following program, which uses the wcbreast_wdbc dataset to classify whether a breast tumor is cancerous (malignant) or not (benign). The iris dataset, by contrast, is not binary, because there are three classes (3 types of iris).
In [7]:
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
import tensorflow.contrib.learn as skflow
import numpy as np
from sklearn import metrics
path = "./data/"
filename = os.path.join(path,"wcbreast_wdbc.csv")
df = pd.read_csv(filename,na_values=['NA','?'])
# Encode feature vector
df.drop('id',axis=1,inplace=True)
# All 30 inputs are continuous measurements, so z-score them all
encode_numeric_zscore(df,'mean_radius')
encode_numeric_zscore(df,'mean_texture')
encode_numeric_zscore(df,'mean_perimeter')
encode_numeric_zscore(df,'mean_area')
encode_numeric_zscore(df,'mean_smoothness')
encode_numeric_zscore(df,'mean_compactness')
encode_numeric_zscore(df,'mean_concavity')
encode_numeric_zscore(df,'mean_concave_points')
encode_numeric_zscore(df,'mean_symmetry')
encode_numeric_zscore(df,'mean_fractal_dimension')
encode_numeric_zscore(df,'se_radius')
encode_numeric_zscore(df,'se_texture')
encode_numeric_zscore(df,'se_perimeter')
encode_numeric_zscore(df,'se_area')
encode_numeric_zscore(df,'se_smoothness')
encode_numeric_zscore(df,'se_compactness')
encode_numeric_zscore(df,'se_concavity')
encode_numeric_zscore(df,'se_concave_points')
encode_numeric_zscore(df,'se_symmetry')
encode_numeric_zscore(df,'se_fractal_dimension')
encode_numeric_zscore(df,'worst_radius')
encode_numeric_zscore(df,'worst_texture')
encode_numeric_zscore(df,'worst_perimeter')
encode_numeric_zscore(df,'worst_area')
encode_numeric_zscore(df,'worst_smoothness')
encode_numeric_zscore(df,'worst_compactness')
encode_numeric_zscore(df,'worst_concavity')
encode_numeric_zscore(df,'worst_concave_points')
encode_numeric_zscore(df,'worst_symmetry')
encode_numeric_zscore(df,'worst_fractal_dimension')
diagnosis = encode_text_index(df,'diagnosis')
num_classes = len(diagnosis)
# Create x & y for training
# Create the x-side (feature vectors) of the training
x, y = to_xy(df,'diagnosis')
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create a deep neural network with 3 hidden layers of 10, 20, 10
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=num_classes,
                                            steps=10000)

# Early stopping
early_stop = skflow.monitors.ValidationMonitor(x_test, y_test,
                                               early_stopping_rounds=200, print_steps=50,
                                               n_classes=num_classes)
# Fit/train neural network
classifier.fit(x_train, y_train, early_stop)
# Measure accuracy over the entire dataset (train + test combined)
score = metrics.accuracy_score(y, classifier.predict(x))
print("Final accuracy: {}".format(score))
The confusion matrix is a common visualization for both binary and larger classification problems. Often a model will have difficulty differentiating between two classes. For example, a neural network might be really good at telling the difference between cats and dogs, but not so good at telling the difference between dogs and wolves. The following code generates a confusion matrix:
In [8]:
import numpy as np
from sklearn.metrics import confusion_matrix
pred = classifier.predict(x_test)
# Compute confusion matrix
cm = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, diagnosis)
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, diagnosis, title='Normalized confusion matrix')
plt.show()
The above two confusion matrices show the same network. The bottom (normalized) matrix is the type you will normally see. Notice the two labels: "B" means benign (no cancer) and "M" means malignant (cancer). The left-right (x) axis shows the predicted labels; the top-bottom (y) axis shows the true labels. A perfect model (one that never makes an error) has a dark blue diagonal running from top-left to bottom-right.
To read the chart, consider the top-left square. This square indicates a true label of "B" and a predicted label of "B". This is good: the prediction matched the truth. The blueness of this box represents how often "B" is classified correctly. It is not the darkest blue, because the square to its right (which is off the perfect diagonal) has some color; that square indicates a truth of "B" but a prediction of "M". The white square at the bottom-left indicates a truth of "M" but a prediction of "B"; its whiteness indicates that this rarely happens.
Your conclusion from the above chart is that the model sometimes classifies "B" as "M" (a false positive, since malignant is the positive class), but never misclassifies "M" as "B". Always look for the dark diagonal; that is a good sign!
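If you prefer raw counts to colors, the four cells of a binary confusion matrix can be read out directly. A minimal sketch, assuming the un-normalized cm computed above (scikit-learn puts true labels on the rows and predicted labels on the columns, with class 0 = B and class 1 = M):

# Sketch: unpack the 2x2 confusion matrix computed above.
# Rows are true labels, columns are predicted labels; 0 = B (benign), 1 = M (malignant).
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("True negatives  (B predicted B): {}".format(tn))
print("False positives (B predicted M): {}".format(fp))
print("False negatives (M predicted B): {}".format(fn))
print("True positives  (M predicted M): {}".format(tp))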
ROC curves can be a bit confusing, but they are very common, so it is important to know how to read them. Even their name is confusing: "receiver operating characteristic" is a term borrowed from electrical engineering, where such curves were first used to evaluate radar receivers. Do not worry about the name.
Binary classification is common in medical testing, where you often want to diagnose whether someone has a disease. This can lead to two types of errors, known as false positives and false negatives. A false positive occurs when the model reports the disease in a patient who does not have it; a false negative occurs when the model fails to report a disease that is actually present.
Neural networks classify in terms of the probability that a case is positive. But at what probability do you report a positive result? Is the cutoff 50%? 90%? Where you set this cutoff is called the threshold. Anything above the cutoff is classified as positive, anything below as negative. Lowering the cutoff makes the model more sensitive (fewer positives are missed); raising it makes the model more specific (fewer negatives are wrongly flagged). The sketch below makes this trade-off concrete.
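A minimal sketch of thresholding, assuming the classifier, x_test, and y_test from the breast cancer example above (the three cutoff values are arbitrary choices for illustration):

# Sketch: vary the decision threshold on the predicted probability of the
# positive class ("M") and watch sensitivity/specificity trade off.
proba = classifier.predict_proba(x_test)[:, 1]  # P(malignant)
actual = y_test.flatten()
for threshold in [0.1, 0.5, 0.9]:
    pred = (proba >= threshold).astype(int)     # positive if at or above the cutoff
    tp = np.sum((pred == 1) & (actual == 1))
    fn = np.sum((pred == 0) & (actual == 1))
    tn = np.sum((pred == 0) & (actual == 0))
    fp = np.sum((pred == 1) & (actual == 0))
    sensitivity = tp / float(tp + fn)           # true positive rate
    specificity = tn / float(tn + fp)           # true negative rate
    print("cutoff={:.1f} sensitivity={:.2f} specificity={:.2f}".format(
        threshold, sensitivity, specificity))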
An ROC curve measures how good a model is regardless of the cutoff: it plots the true positive rate (sensitivity) on the y-axis against the false positive rate on the x-axis, with one point for every possible threshold.
The following code shows an ROC chart for the breast cancer neural network. The area under the curve (AUC) is also an important measure: an AUC of 1.0 indicates a perfect model, while an AUC of 0.5 is no better than random guessing. The larger the AUC, the better.
In [10]:
pred = classifier.predict_proba(x_test)
pred = pred[:, 1]  # Only the probability of the positive class ("M")
plot_roc(pred, y_test)
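If you only want the AUC as a single number, without plotting, scikit-learn can compute it directly. A minimal sketch, reusing pred and y_test from the cell above:

# Sketch: AUC as a single summary number (1.0 = perfect, 0.5 = random guessing)
from sklearn.metrics import roc_auc_score
print("AUC: {:.3f}".format(roc_auc_score(y_test.flatten(), pred)))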
The confusion matrix also works with more than two classes. The following code trains a neural network on the iris dataset (three classes); the cell after it computes and plots the confusion matrix.
In [11]:
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
import tensorflow.contrib.learn as skflow
import numpy as np
path = "./data/"
filename = os.path.join(path,"iris.csv")
df = pd.read_csv(filename,na_values=['NA','?'])
# Encode feature vector
encode_numeric_zscore(df,'petal_w')
encode_numeric_zscore(df,'petal_l')
encode_numeric_zscore(df,'sepal_w')
encode_numeric_zscore(df,'sepal_l')
species = encode_text_index(df,"species")
num_classes = len(species)
# Create x & y for training
# Create the x-side (feature vectors) of the training
x, y = to_xy(df,'species')
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=45)
# as much as I would like to use 42, it gives a perfect result, and a boring confusion matrix!
# Create a deep neural network with 3 hidden layers of 10, 20, 10
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=num_classes,
                                            steps=10000)

# Early stopping
early_stop = skflow.monitors.ValidationMonitor(x_test, y_test,
                                               early_stopping_rounds=200, print_steps=50,
                                               n_classes=num_classes)
# Fit/train neural network
classifier.fit(x_train, y_train, early_stop)
In [12]:
import numpy as np
from sklearn.metrics import confusion_matrix
pred = classifier.predict(x_test)
# Compute confusion matrix
cm = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, species)
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, species, title='Normalized confusion matrix')
plt.show()
See the strong diagonal? Iris is easy. See the light blue near the bottom? Sometimes virginica is confused with versicolor.
Regression neural networks predict a number rather than a class. The following code trains a regression network on the auto-mpg dataset to predict miles per gallon; the lift chart plotted afterwards shows how well it does.
In [13]:
import tensorflow.contrib.learn as skflow
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
path = "./data/"
filename_read = os.path.join(path,"auto-mpg.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])
# create feature vector
missing_median(df, 'horsepower')
df.drop('name', axis=1, inplace=True)
encode_numeric_zscore(df, 'horsepower')
encode_numeric_zscore(df, 'weight')
encode_numeric_zscore(df, 'cylinders')
encode_numeric_zscore(df, 'displacement')
encode_numeric_zscore(df, 'acceleration')
encode_text_dummy(df, 'origin')
# Encode to a 2D matrix for training (the target must be a column name, not a list)
x, y = to_xy(df, 'mpg')
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)
# Create a deep neural network with 3 hidden layers of 50, 25, 10
regressor = skflow.TensorFlowDNNRegressor(hidden_units=[50, 25, 10], steps=5000)
# Early stopping
early_stop = skflow.monitors.ValidationMonitor(x_test, y_test,
                                               early_stopping_rounds=200, print_steps=50)
# Fit/train neural network
regressor.fit(x_train, y_train, early_stop)
In [14]:
pred = regressor.predict(x_test)
chart_regression(pred,y_test)
To generate a lift chart, perform the following activities:
- Sort the data by the expected output (y); the sorted expected values form the "expected" line.
- For every point on the x-axis, plot the predicted value for that same data point; this forms the "prediction" line.
- The x-axis is simply the test cases, ordered from lowest to highest expected value; the y-axis is the output value (here, mpg).
Reading a lift chart:
- The expected and prediction lines should track each other closely; notice where one runs above or below the other.
- Regions where the two lines diverge show the output ranges where the model is least accurate.
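The lift chart is a qualitative check. To attach a single number to regression accuracy, a common companion metric is the root mean square error (RMSE). A minimal sketch, reusing regressor, x_test, and y_test from the auto-mpg example above:

# Sketch: score the regression numerically with RMSE
# (average prediction error in the units of the target, here mpg; lower is better)
from sklearn import metrics
pred = regressor.predict(x_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test.flatten(), pred.flatten()))
print("RMSE: {:.3f}".format(rmse))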