In [ ]:
import numpy as np
%pylab inline
In [ ]:
# Load the data as usual (here the code for Python 2.7)
X = np.loadtxt('data/small_Endometrium_Uterus.csv', delimiter=',', skiprows=1, usecols=range(1, 3001))
y = np.loadtxt('data/small_Endometrium_Uterus.csv', delimiter=',', skiprows=1, usecols=[3001],
converters={3001: lambda s: 0 if s=='Endometrium' else 1}, dtype='int')
In [ ]:
# Set up a stratified 10-fold cross-validation
from sklearn import cross_validation
folds = cross_validation.StratifiedKFold(y, 10, shuffle=True)
In [ ]:
# Create a function that does cross-validation and scales the features on each training set.
from sklearn import preprocessing
def cross_validate_with_scaling(design_matrix, labels, classifier, cv_folds):
""" Perform a cross-validation and returns the predictions.
Use a scaler to scale the features to mean 0, standard deviation 1.
Parameters:
-----------
design_matrix: (n_samples, n_features) np.array
Design matrix for the experiment.
labels: (n_samples, ) np.array
Vector of labels.
classifier: sklearn classifier object
Classifier instance; must have the following methods:
- fit(X, y) to train the classifier on the data X, y
- predict_proba(X) to apply the trained classifier to the data X and return probability estimates
cv_folds: sklearn cross-validation object
Cross-validation iterator.
Return:
-------
pred: (n_samples, ) np.array
Vectors of predictions (same order as labels).
"""
pred = np.zeros(labels.shape) # vector of 0 in which to store the predictions
for tr, te in cv_folds:
# Restrict data to train/test folds
Xtr = design_matrix[tr, :]
ytr = labels[tr]
Xte = design_matrix[te, :]
#print Xtr.shape, ytr.shape, Xte.shape
# Scale data
scaler = preprocessing.StandardScaler() # create scaler
Xtr = scaler.fit_transform(Xtr) # fit the scaler to the training data and transform training data
Xte = scaler.transform(Xte) # transform test data
# Fit classifier
classifier.fit(Xtr, ytr)
# Predict probabilities (of belonging to +1 class) on test data
yte_pred = classifier.predict_proba(Xte) # two-dimensional array
# Identify the index, in yte_pred, of the positive class (y=1)
# index_of_class_1 = np.nonzero(classifier.classes_ == 1)[0][0]
index_of_class_1 = 1 - ytr[0] # 0 if the first sample is positive, 1 otherwise
pred[te] = yte_pred[:, index_of_class_1]
return pred
In [ ]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(penalty='l1')
Question Compute the cross-validated predictions of the l1-regularized logistic regression with default parameters on our data.
In [ ]:
Question Plot the corresponding ROC curve, and compare it to that obtained for non-regularized logistic regression.
In [ ]:
What does the C parameter correspond to? See the documentation at http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression for help.
Scikit-learn makes it really easy to use a nested cross-validation to choose a good value for C among a grid of several choices.
In [ ]:
from sklearn import grid_search
param_grid = {'C':[1e-3, 1e-2, 1e-1, 1., 1e2, 1e3]}
clf = grid_search.GridSearchCV(linear_model.LogisticRegression(penalty='l1'), param_grid)
Question What criterion is used to chose the optimal C? See the documentation at http://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV. Try changing this criterion http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
In [ ]:
Question Compute the cross-validated predictions of the l1-regularized logistic regression with optimized C parameter on our data.
In [ ]:
GridSearchCV also uses the optimal parameter(s) it detected to fit a model to its entire training data again, generating a "best model" that is accessible via the best_estimator_
attribute.
In our case, because we called GridSearchCV from inside a cross-validation loop, clf.best_estimator_
is the "best model" on the last training fold.
In [ ]:
print clf.best_estimator_
Question Plot the corresponding ROC curve, and compare to that obtained for
In [ ]:
In [ ]:
# This code plots the regression weights of the classifier 'clf'
plt.plot(range(len(clf.best_estimator_.coef_[0])), clf.best_estimator_.coef_[0],
color='blue', marker='+', linestyle='')
plt.xlabel('Genes', fontsize=16)
plt.ylabel('Weights', fontsize=16)
plt.title('Logistic regression weights', fontsize=16)
plt.xlim([0, X.shape[1]])
Question Compare the regression weights obtained with and without l1-regularization, in two side-by-side plots.
In [ ]:
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(121) # use a 1x2 subplot grid; ax will refer to the 1st subplot
number_of_weights = #TODO
logreg_weights = #TODO
ax.plot(range(number_of_weights), logreg_weights,
color='blue', marker='+', linestyle='')
ax.set_xlabel('Genes', fontsize=16)
ax.set_ylabel('Weights', fontsize=16)
ax.set_title('Logistic regression weights', fontsize=16)
ax.set_xlim([0, X.shape[1]])
ax = fig.add_subplot(122) # use a 1x2 subplot grid; ax will refer to the 2nd subplot
l1_logreg_weights = #TODO
ax.plot(ange(number_of_weights), l1_logreg_weights,
color='blue', marker='+', linestyle='')
ax.set_xlabel('Genes', fontsize=16)
ax.set_ylabel('Weights', fontsize=16)
ax.set_title('Regularized Logistic regression weights', fontsize=16)
ax.set_xlim([0, X.shape[1]])
plt.tight_layout()
In [ ]:
clf = grid_search.GridSearchCV(linear_model.LogisticRegression(penalty='l2'), param_grid)
Question Compute the cross-validated predictions of an l2-regularized logistic regression with optimized C parameters on our data.
In [ ]:
Question Plot the corresponding ROC curve, and compare to that obtained for
In [ ]:
Question Compare the regression weights obtained with l2-regularization to those obtained
In [ ]:
In [ ]: