In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import sklearn.cross_validation
c0=sns.color_palette()[0]
c1=sns.color_palette()[1]
c2=sns.color_palette()[2]
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light,
cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
h = .02
X=np.concatenate((Xtr, Xte))
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
#plt.figure(figsize=(10,6))
if zfunc:
p0 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0]
p1 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z=zfunc(p0, p1)
else:
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
ZZ = Z.reshape(xx.shape)
if mesh:
        ax.pcolormesh(xx, yy, ZZ, cmap=cmap_light, alpha=alpha)
if predicted:
showtr = clf.predict(Xtr)
showte = clf.predict(Xte)
else:
showtr = ytr
showte = yte
ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cmap_bold,
s=psize, alpha=alpha,edgecolor="k")
# and testing points
ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cmap_bold,
alpha=alpha, marker="s", s=psize+10)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
return ax,xx,yy
def points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light,
cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
ax,xx,yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False,
colorscale=colorscale, cdiscrete=cdiscrete,
psize=psize, alpha=alpha, predicted=True)
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=ccolor, alpha=.2)
    cs2 = ax.contour(xx, yy, Z, cmap=ccolor, alpha=.6)
    ax.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14)
return ax
We'll use a dataset of heights and weights of males and females to hone our understanding of classifiers. We load the data into a dataframe and plot it.
In [2]:
dflog = pd.read_csv("data/01_heights_weights_genders.csv")
dflog.head()
Out[2]:
In [3]:
# Checkup Exercise Set I:
# Create a scatter plot of Weight vs. Height
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
plt.scatter(dflog.Weight, dflog.Height)
plt.xlabel("Weight")
plt.ylabel("Height")
plt.title("Relationship between Weight and Height")
Out[3]:
In [4]:
# Checkup Exercise Set I:
# Color the points differently by Gender.
plt.scatter(dflog.Weight, dflog.Height, c=['Blue' if gender == 'Male' else 'Pink' for gender in dflog['Gender']])
Out[4]:
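A variant of the same gender-colored scatter plot that also gets axis labels and a legend (a small sketch using the same dflog columns) is to plot each gender separately:
males = dflog[dflog.Gender == "Male"]
females = dflog[dflog.Gender == "Female"]
plt.scatter(males.Weight, males.Height, c="blue", alpha=0.3, label="Male")
plt.scatter(females.Weight, females.Height, c="pink", alpha=0.3, label="Female")
plt.xlabel("Weight")
plt.ylabel("Height")
plt.legend()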
First, we try a basic Logistic Regression:
In [5]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(dflog[['Height','Weight']].values,
(dflog.Gender == "Male").values,random_state=5)
clf = LogisticRegression()
# Fit the model on the training data.
clf.fit(Xlr, ylr)
# Print the accuracy score of the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))
We use the following cv_score function to perform K-fold cross-validation, applying a scoring function to each held-out fold. Here we use accuracy as the default scoring function.
In [6]:
from sklearn.model_selection import KFold  # this KFold has the .split method used below
from sklearn.metrics import accuracy_score
def cv_score(clf, x, y, score_func=accuracy_score):
result = 0
nfold = 5
for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
clf.fit(x[train], y[train]) # fit
result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
return result / nfold # average
Below is an example of using the cv_score function for a basic logistic regression model with the default L2 regularization strength (C=1).
In [7]:
# Demonstrate cv_score on a basic Logistic Regression:
# Split the data into a training and test set, then report the
# average 5-fold cross-validation accuracy on the training set
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
Xlr, Xtestlr, ylr, ytestlr = train_test_split(dflog[['Height','Weight']].values,
                                              (dflog.Gender == "Male").values, random_state=5)
clf = LogisticRegression()
cv_score(clf, Xlr, ylr)
Out[7]:
In [8]:
#the grid of parameters to search over
Cs = [0.001, 0.1, 1, 10, 100]
# your turn
# Checkup Exercise Set II
# Find the best model parameters based only on training set.
# For each C:
# 1) Create a logistic regression model with that value of C
# 2) Find the average score for this model using the cv_score
# function only on the training set (Xlr, ylr)
# 3) Pick the C with the highest average score
max_score = 0
for c in Cs:
    clf = LogisticRegression(C=c)
    # average 5-fold cross-validation accuracy, using the training set only
    score = cv_score(clf, Xlr, ylr)
    print("For C value of %f the cross-validated score is: %f" % (c, score))
    if score > max_score:
        max_score = score
        best_c = c
print("Best C value is: %f with a score of: %f" % (best_c, max_score))
In [9]:
# your turn
# Checkup Exercise Set III
# 1) Use the C obtained from procedure above and train
# a Logistic Regression on the training data.
clf = LogisticRegression(C=best_c)
clf.fit(Xlr, ylr)
# 2) Calculate the accuracy on the test data.
accuracy = accuracy_score(clf.predict(Xtestlr), ytestlr)
print(accuracy)
In [27]:
# your turn
# Checkup Exercise Set IV
# 1) Use GridSearchCV to find the best model over the training set
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
# GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise')
param_grid = {'C': [0.001, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(C=1), param_grid=param_grid)
clf.fit(Xlr, ylr)
print("Grid scores:")
for params, mean_score, scores in clf.grid_scores_:
print("%0.3f (+/-%0.03f) for %r"
% (mean_score, scores.std() * 2, params))
print("Accuracy is: %f" % accuracy_score(clf.predict(Xtestlr), ytestlr))
print()
print("Classification report:")
print()
print("The best model of training set based on GridSearchCV:")
print()
y_true, y_pred = ytestlr, clf.predict(Xtestlr)
print(classification_report(y_true, y_pred))
In [18]:
from sklearn.grid_search import GridSearchCV

def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(Xtrain, ytrain)
    print("BEST PARAMS", gs.best_params_)
    best = gs.best_estimator_
    return best
In [19]:
from sklearn.cross_validation import train_test_split
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8):
subdf=indf[featurenames]
if standardize:
subdfstd=(subdf - subdf.mean())/subdf.std()
else:
subdfstd=subdf
X=subdfstd.values
y=(indf[targetname].values==target1val)*1
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size)
clf = cv_optimize(clf, parameters, Xtrain, ytrain)
clf=clf.fit(Xtrain, ytrain)
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)
print("Accuracy on training data: {:0.2f}".format(training_accuracy))
print("Accuracy on test data: {:0.2f}".format(test_accuracy))
return clf, Xtrain, ytrain, Xtest, ytest
In [20]:
h = lambda z: 1. / (1 + np.exp(-z))
zs=np.arange(-5, 5, 0.1)
plt.plot(zs, h(zs), alpha=0.5);
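The lambda h above is the sigmoid (logistic) function that logistic regression applies to a linear score to turn it into a probability; a couple of quick checks using the h just defined:
# h squashes any real-valued score z into (0, 1), with h(0) = 0.5 and the
# symmetry h(-z) = 1 - h(z), which is why 0.5 is the natural decision threshold
print(h(0), h(2), h(-2))   # 0.5, ~0.88, ~0.12
print(h(2) + h(-2))        # 1.0, by symmetry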
In [14]:
dflog.head()
Out[14]:
In [28]:
clf_l, Xtrain_l, ytrain_l, Xtest_l, ytest_l = do_classify(LogisticRegression(),
{"C": [0.01, 0.1, 1, 10, 100]},
dflog, ['Weight', 'Height'], 'Gender','Male')
In [22]:
plt.figure()
ax=plt.gca()
points_plot(ax, Xtrain_l, Xtest_l, ytrain_l, ytest_l, clf_l, alpha=0.2);
In [23]:
clf_l.predict_proba(Xtest_l)
Out[23]:
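The second column of predict_proba is the estimated probability of the positive class (Gender == 'Male', encoded as 1 by do_classify). As a quick check (using the clf_l, Xtest_l, and h defined earlier; clf_l is the best LogisticRegression returned by do_classify), it should equal the sigmoid of the linear score, and thresholding it at 0.5 reproduces predict:
probs = clf_l.predict_proba(Xtest_l)[:, 1]    # P(Male) for each test point
# predict_proba is the sigmoid of the linear score w.x + b
print(np.allclose(probs, h(clf_l.decision_function(Xtest_l))))
# predict is just "probability above 0.5"
print(np.all((probs > 0.5).astype(int) == clf_l.predict(Xtest_l)))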
In [24]:
plt.figure()
ax = plt.gca()
points_plot_prob(ax, Xtrain_l, Xtest_l, ytrain_l, ytest_l, clf_l, psize=20, alpha=0.1);
In [ ]: