In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from skutil.feature_selection import MulticollinearityFilterer
# check the installed skutil version
import skutil
skutil.__version__
Out[1]:
In [2]:
iris = load_iris()
X = pd.DataFrame.from_records(data=iris.data, columns=iris.feature_names)
In [3]:
X.head()
Out[3]:
In [4]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, iris.target, train_size=0.75)
In [5]:
from sklearn.pipeline import Pipeline
from skutil.preprocessing import BoxCoxTransformer, SelectiveScaler
from skutil.decomposition import SelectivePCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# build a pipeline
pipe = Pipeline([
    ('collinearity', MulticollinearityFilterer(threshold=0.85)),
    ('scaler'      , SelectiveScaler()),
    ('boxcox'      , BoxCoxTransformer()),
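    # a float n_components keeps enough components to explain that fraction of
    # variance (sklearn's PCA convention; assumed to hold for SelectivePCA too)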
    ('pca'         , SelectivePCA(n_components=0.9)),
    ('model'       , RandomForestClassifier())
])
# fit the pipe, report scores
pipe.fit(X_train, y_train)
# report scores
print('Train RF accuracy: %.5f' % accuracy_score(y_train, pipe.predict(X_train)))
print('Test RF accuracy: %.5f' % accuracy_score(y_test, pipe.predict(X_test)))
The performance isn't bad. The training accuracy is phenomenal, but the validation accuracy is sub-par. Plus, there's quite a bit of variance in the model, isn't there? Let's try to improve our performance as well as reduce the variability (while sacrificing some bias, unfortunately).
Beware: this grid can be a lot to handle for an older or weaker machine.
In [9]:
from skutil.grid_search import RandomizedSearchCV
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from skutil.feature_selection import NearZeroVarianceFilterer
from scipy.stats import randint, uniform
# default CV does not shuffle, so we define our own
custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True, random_state=42)
# build a pipeline -- let's also add a NearZeroVarianceFilterer prior to PCA
pipe = Pipeline([
    ('collinearity', MulticollinearityFilterer(threshold=0.85)),
    ('scaler'      , SelectiveScaler()),
    ('boxcox'      , BoxCoxTransformer()),
    ('filterer'    , NearZeroVarianceFilterer()),
    ('pca'         , SelectivePCA(n_components=0.9)),
    ('model'       , RandomForestClassifier(n_jobs=-1))
])
# let's define a set of hyper-parameters over which to search
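# note: scipy's uniform(loc, scale) samples from [loc, loc + scale],
# and randint(low, high) samples integers from low to high - 1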
hp = {
    'collinearity__threshold' : uniform(loc=.8, scale=.15),
    'collinearity__method'    : ['pearson', 'kendall', 'spearman'],
    'scaler__scaler'          : [StandardScaler(), RobustScaler()],
    'filterer__threshold'     : uniform(loc=1e-6, scale=0.005),
    'pca__n_components'       : uniform(loc=.75, scale=.2),
    'pca__whiten'             : [True, False],
    'model__n_estimators'     : randint(5, 100),
    'model__max_depth'        : randint(2, 25),
    'model__min_samples_leaf' : randint(1, 15),
    'model__max_features'     : uniform(loc=.5, scale=.5),
    'model__max_leaf_nodes'   : randint(10, 75)
}
# define the gridsearch
search = RandomizedSearchCV(pipe, hp,
                            n_iter=50,
                            scoring='accuracy',
                            cv=custom_cv,
                            random_state=42)
# fit the search
search.fit(X_train, y_train)
# report scores
print('Train RF accuracy: %.5f' % accuracy_score(y_train, search.predict(X_train)))
print('Test RF accuracy: %.5f' % accuracy_score(y_test, search.predict(X_test)))
This is much better! We've dramatically reduced the variance in our model, though we've taken a slight hit in terms of bias. With different models, or even an ensemble of different models (an ensemble of ensembles?), we could probably achieve an even better score.
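As a sketch of that ensembling idea (not run here, and the estimator mix is arbitrary), sklearn's VotingClassifier can combine heterogeneous models and could be swapped in for the 'model' step of the pipeline:

In [ ]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# a hypothetical soft-voting ensemble to drop in as the pipeline's 'model' step
ensemble = VotingClassifier(estimators=[
        ('rf',  RandomForestClassifier()),
        ('lr',  LogisticRegression()),
        ('svc', SVC(probability=True))  # probability=True is required for soft voting
    ], voting='soft')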
It's also important to note that we were relatively cavalier in our preprocessing... in a real-world situation, you'd examine each step and make sure the way the data is being transformed actually makes sense.
Finally, note that the skutil grid search API differs slightly from sklearn's... in sklearn, you can call search.best_estimator_.predict; however, when using SelectiveMixin transformers, feature names may be altered internally by the grid search API for compatibility with sklearn's cross_validation module. Thus, in skutil, use search.predict instead.
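For example (a minimal sketch reusing the fitted search from above):

In [ ]:
# plain sklearn would let you write:
#   predictions = search.best_estimator_.predict(X_test)
# with skutil's RandomizedSearchCV, predict through the search object itself:
predictions = search.predict(X_test)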
Here are the best parameters for the grid:
In [10]:
search.best_params_
Out[10]:
In [11]:
from sklearn.externals import joblib
# write the model
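# compress=3 trades a little CPU time for a smaller file (joblib accepts levels 0-9)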
joblib.dump(search, 'final_model.pkl', compress=3)
Out[11]:
In [13]:
from __future__ import print_function
# load the model
final_model = joblib.load('final_model.pkl')
# load your data
# new_data = pd.read_csv('...')
# ... any other pre-processing you may have done outside of the pipeline
# here's our example data
new_data = X
# make predictions
predictions = final_model.predict(new_data)
# view the top few
print(predictions[:5])
# view the performance (we can do this because we have the ground truth)
print(accuracy_score(iris.target, predictions))
In [14]:
# disk cleanup for git
!rm final_model.pkl