In [7]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn import svm
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.datasets import load_digits, samples_generator
from sklearn.model_selection import (StratifiedShuffleSplit, ShuffleSplit,
                                     GridSearchCV, train_test_split,
                                     learning_curve)
from sklearn.metrics import roc_curve, auc, classification_report
import pandas as pd
import time
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
from numpy import interp  # scipy.interp was just an alias for np.interp
In [3]:
estimators = [('reduce_dim', PCA()),
              ('clf', SVC())]
pipe = Pipeline(estimators)
pipe
Out[3]:
In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Binarizer
make_pipeline(Binarizer(), MultinomialNB())
Out[4]:
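Note: make_pipeline generates the step names automatically from the lowercased class names, so the two steps above are addressable as 'binarizer' and 'multinomialnb'. A quick check (a sketch reusing the objects above):
In [ ]:
# step names are auto-generated lowercased class names
make_pipeline(Binarizer(), MultinomialNB()).steps
# [('binarizer', Binarizer(...)), ('multinomialnb', MultinomialNB(...))]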
In [6]:
pipe.steps[0]
Out[6]:
In [7]:
pipe.named_steps['reduce_dim']
Out[7]:
In [10]:
pipe.set_params(clf__C=10)
Out[10]:
In [11]:
pipe.named_steps.reduce_dim is pipe.named_steps['reduce_dim']
Out[11]:
In [90]:
from sklearn.model_selection import GridSearchCV
param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                  clf__C=[0.1, 10, 100])
grid_search = GridSearchCV(pipe, param_grid=param_grid)
In [91]:
grid_search
Out[91]:
Run a grid search over the dimensionality-reduction parameters, then classify with a linear SVM
In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
In [18]:
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)
Out[18]:
In [19]:
mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# select score for best C
mean_scores = mean_scores.max(axis=0)
print(mean_scores)
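The reshape above leans on cv_results_ rows following ParameterGrid order (keys iterated alphabetically, with the last key varying fastest). A quick sanity check of that assumption (a sketch):
In [ ]:
# print the first few parameter combinations to confirm the ordering
# the reshape relies on
for params in grid.cv_results_['params'][:6]:
    print(params)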
In [22]:
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)
plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Digit classification accuracy')
plt.ylim((0, 1))
plt.legend(loc='upper left')
plt.show()
In [23]:
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
In [24]:
#Let's make a learning curve
# invent a classification problem
X, y = samples_generator.make_classification(
    n_features=100, n_informative=5, n_redundant=10, n_classes=4,
    n_clusters_per_class=2)
In [25]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds,
          - an object to be used as a cross-validation generator,
          - an iterable yielding train/test splits.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt

digits = load_digits()
X, y = digits.data, digits.target
title = 'Learning Curves (Naive Bayes)'
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01),
                    cv=cv, n_jobs=4)
Out[25]:
In [55]:
X, y = samples_generator.make_classification(
    n_features=1000, n_informative=10, n_redundant=10, n_classes=4,
    n_clusters_per_class=1,
    n_samples=100)
print(X.shape)
cv = ShuffleSplit(n_splits=50, test_size=0.05, random_state=0)
my_plot = plot_learning_curve(estimator, title, X, y,
                              cv=cv, n_jobs=4)
my_plot.show()
In [80]:
X_toy = np.array([[1, 2, 3],
                  [10, 50, 15],
                  [100, 500, 150]])
print(X_toy)
print(np.mean(X_toy, axis=0))
In [9]:
def pqn_normalize(X):
    # reference profile: the mean of each feature across samples
    feature_means = np.mean(X, axis=0)
    # quotient of each sample against the reference profile
    X_quotients = np.divide(X, feature_means)
    # the median quotient per sample estimates its dilution factor
    dilution_factors = np.median(X_quotients, axis=1)
    # broadcast so each row is divided by its own dilution factor
    X_pqn = np.divide(X, dilution_factors[:, np.newaxis])
    return X_pqn

pqn_normalize(X_toy)
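PQN (probabilistic quotient normalization) estimates each sample's dilution factor as the median of its feature-wise quotients against a reference profile (here, the feature means), then divides the sample by that factor. In X_toy the third row is a 10x rescaling of the second, so PQN maps both onto identical values:
In [ ]:
# sanity check: rows differing only by a scale factor become identical
print(pqn_normalize(X_toy))
# rows come out as [37, 74, 111], [37, 185, 55.5], [37, 185, 55.5]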
In [2]:
X, y = samples_generator.make_classification(
    n_features=100, n_informative=10, n_redundant=10, n_classes=2,
    n_clusters_per_class=1,
    n_samples=20)
pipe = Pipeline([
    ('pqn_normalize', FunctionTransformer(pqn_normalize)),
    ('rf_clf', RandomForestClassifier())
])
param_grid = dict(rf_clf__n_estimators=np.int_(
    np.round(np.linspace(50, 2000, 10))))
n_iter = 5
test_size = 0.3
random_state = 1
cv = StratifiedShuffleSplit(n_splits=n_iter, test_size=test_size,
                            random_state=random_state)
#param_grid = {rf_clf__n_estimators=stats.uniform()}
grid_search = GridSearchCV(pipe, param_grid=param_grid,
                           scoring='roc_auc',
                           cv=cv)
In [127]:
np.int_(np.array([10.2, 11.5]))
Out[127]:
In [166]:
print(y)
grid_search.fit(X, y)
Out[166]:
In [168]:
pd.DataFrame(grid_search.cv_results_)
Out[168]:
In [172]:
# Now plot a learning curve with given parameters
cv_learn_curve = StratifiedShuffleSplit(
    n_splits=25, test_size=0.3, random_state=random_state)
my_plt = plot_learning_curve(grid_search.best_estimator_,
                             'Learning curve (RF) chosen via GridSearchCV',
                             X, y, cv=cv_learn_curve, n_jobs=4)
my_plt.show()
In [186]:
# Get an ROC curve plot from the best estimator
def roc_curve_cv(X, y, clf, cross_val,
                 path='/home/irockafe/Desktop/roc.pdf',
                 save=False, plot=True):
    t1 = time.time()
    # collect values for the ROC curves
    tpr_list = []
    mean_fpr = np.linspace(0, 1, 100)
    auc_list = []
    # get the false-positive and true-positive rate for each split
    for i, (train, test) in enumerate(cross_val.split(X, y)):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])[:, 1]
        # get fpr, tpr
        fpr, tpr, thresholds = roc_curve(y[test], y_pred)
        roc_auc = auc(fpr, tpr)
        # interpolate onto a common fpr grid so curves can be averaged
        tpr_list.append(interp(mean_fpr, fpr, tpr))
        tpr_list[-1][0] = 0.0
        auc_list.append(roc_auc)
        if i % 10 == 0:
            print('{perc}% done! {time}s elapsed'.format(
                perc=100 * float(i) / cross_val.n_splits,
                time=(time.time() - t1)))
    # get mean tpr and fpr
    mean_tpr = np.mean(tpr_list, axis=0)
    # make sure it ends up at 1.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc_list)
    if plot:
        # plot mean ROC curve
        plt.plot(mean_fpr, mean_tpr,
                 label=r'Mean ROC - AUC = %0.2f $\pm$ %0.2f' % (mean_auc, std_auc),
                 lw=5, color='b')
        # plot luck-line
        plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
                 label='Luck', alpha=0.5)
        # plot +/- 1 std
        std_tpr = np.std(tpr_list, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                         label=r'$\pm$ 1 stdev')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve, {iters} iterations of {cv} cross validation'.format(
            iters=cross_val.n_splits,
            cv='{train}:{test}'.format(test=cross_val.test_size,
                                       train=(1 - cross_val.test_size))))
        plt.legend(loc="lower right")
        if save:
            plt.savefig(path, format='pdf')
    return plt, tpr_list, auc_list, mean_fpr

roc_cv = StratifiedShuffleSplit(
    n_splits=5, test_size=0.3, random_state=random_state)
print(roc_cv)
plt, tpr_list, auc_list, mean_fpr = roc_curve_cv(X, y, grid_search.best_estimator_,
                                                 cross_val=roc_cv)
In [197]:
def rf_gridsearch(X, y):
    # TODO How to require pqn_normalize to be a function accessible here?
    # Make your gridsearch pipeline
    pipe = Pipeline([
        ('pqn_normalize', FunctionTransformer(pqn_normalize)),
        ('rf_clf', RandomForestClassifier())
    ])
    param_grid = dict(rf_clf__n_estimators=np.int_(
        np.round(np.linspace(50, 2000, 10))))
    n_splits = 15
    test_size = 0.3
    random_state = 1
    cv = StratifiedShuffleSplit(n_splits=n_splits,
                                test_size=test_size,
                                random_state=random_state)
    #param_grid = {rf_clf__n_estimators=stats.uniform()}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                               scoring='roc_auc',
                               cv=cv,
                               n_jobs=4)
    grid_search.fit(X, y)
    return grid_search
grid = rf_gridsearch(X, y)
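On the TODO above: FunctionTransformer only stores a reference to the function, so the cleanest fix is to define pqn_normalize at module (or notebook top) level, or to inject it explicitly. A sketch of the injection approach (make_rf_pipe and normalize_func are illustrative names, not part of the original code):
In [ ]:
# sketch: pass the normalization function in rather than relying on a global;
# a plain named function (not a lambda) is also safer for pickling with n_jobs > 1
def make_rf_pipe(normalize_func=pqn_normalize):
    return Pipeline([
        ('pqn_normalize', FunctionTransformer(normalize_func)),
        ('rf_clf', RandomForestClassifier())
    ])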
In [199]:
pd.DataFrame(grid.cv_results_)
Out[199]:
In [10]:
X, y = samples_generator.make_classification(
    n_features=100, n_informative=10, n_redundant=10, n_classes=2,
    n_clusters_per_class=1,
    n_samples=20)
pipe = Pipeline([
    ('pqn_normalize', FunctionTransformer(pqn_normalize)),
    ('pca', PCA()),
    ('tsne', TSNE()),
    ('rf_clf', RandomForestClassifier())
])
param_grid = dict(rf_clf__n_estimators=np.int_(
                      np.round(np.linspace(50, 2000, 10))),
                  pca__n_components=[10, 25, 50, 75, 100],
                  )
n_iter = 5
test_size = 0.3
random_state = 1
cv = StratifiedShuffleSplit(n_splits=n_iter, test_size=test_size,
                            random_state=random_state)
#param_grid = {rf_clf__n_estimators=stats.uniform()}
grid_search = GridSearchCV(pipe, param_grid=param_grid,
                           scoring='roc_auc',
                           cv=cv)
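Caveat: TSNE implements fit_transform but not transform, so it cannot serve as an intermediate Pipeline step; fitting this grid search will fail (at step validation or when transforming held-out data, depending on the scikit-learn version). The next cell sidesteps this by chaining PCA and t-SNE manually, outside a Pipeline.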
In [53]:
X, y = samples_generator.make_classification(
    n_features=100, n_informative=90, n_redundant=10, n_classes=3,
    n_clusters_per_class=1,
    n_samples=60)
pca = PCA(n_components=50)
X_pca = pca.fit_transform(X)
print('PCA shape', X_pca.shape, np.sum(pca.explained_variance_ratio_))
tsne = TSNE()
X_pca_tsne = tsne.fit_transform(X_pca)
print('tsne shape', X_pca_tsne.shape)

def plt_scatter(X, y, title):
    plt.scatter(x=[i[0] for i in X],
                y=[i[1] for i in X],
                c=y)
    plt.title(title)
    plt.show()

plt_scatter(X_pca, y, 'pca')
plt_scatter(X_pca_tsne, y, 'Tsne(pca)')