In [2]:
# Binary Classification
from __future__ import print_function
from vowpalwabbit.sklearn_vw import VWClassifier
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Generate a synthetic binary-classification problem; features are stored as float32.
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X = X.astype(np.float32)

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=256
)

# Fit a Vowpal Wabbit classifier using its default settings.
model = VWClassifier()
model.fit(X_train, y_train)

# Score the fitted model on the data it was trained on, then on the held-out part.
train_score = model.score(X_train, y_train)
print('training score: {}'.format(train_score))
test_score = model.score(X_test, y_test)
print('testing score: {}'.format(test_score))
In [3]:
# Parameter Search (randomized)
# http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py
from operator import itemgetter
from time import time
# sklearn.grid_search was removed in scikit-learn 0.20. The search classes live
# in sklearn.model_selection, the same module this notebook already imports
# train_test_split from.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import uniform


# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print the ``n_top`` candidates with the highest mean validation score.

    Parameters
    ----------
    grid_scores : dict or sequence
        Either the ``cv_results_`` dict of a fitted search object (the keys
        ``'mean_test_score'``, ``'std_test_score'`` and ``'params'`` are read),
        or a legacy ``grid_scores_`` sequence whose entries expose
        ``mean_validation_score``, ``cv_validation_scores`` and ``parameters``.
    n_top : int, optional
        Number of best candidates to print (default 3).

    Returns
    -------
    None
        The report is written to stdout.
    """
    if isinstance(grid_scores, dict):
        # cv_results_ layout: parallel sequences indexed by candidate.
        candidates = zip(grid_scores['mean_test_score'],
                         grid_scores['std_test_score'],
                         grid_scores['params'])
    else:
        # Legacy grid_scores_ layout: one record per candidate.
        candidates = ((score.mean_validation_score,
                       np.std(score.cv_validation_scores),
                       score.parameters) for score in grid_scores)
    # Sort on the mean only (element 0) so parameter dicts are never compared;
    # sorted() is stable, so tied candidates keep their original order.
    top_scores = sorted(candidates, key=itemgetter(0), reverse=True)[:n_top]
    for rank, (mean, std, parameters) in enumerate(top_scores, start=1):
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean, std))
        print("Parameters: {0}".format(parameters))
        print("")


# Randomized search: n_iter candidates are sampled from the distributions below
# rather than enumerating a full grid.
np.random.seed(0)
n_iter = 20
params = {"l2": uniform(0.0001, 0.01),  # scipy uniform(loc, scale): [0.0001, 0.0101]
          "l": [0.01, 0.1, 1.0],
          "power_t": uniform()}         # default loc=0, scale=1: [0, 1]
# run search
search = RandomizedSearchCV(VWClassifier(), param_distributions=params, n_iter=n_iter)
start = time()
search.fit(X, y)
# cv_results_ replaces the grid_scores_ attribute of the old grid_search module;
# its 'params' list has one entry per evaluated candidate.
print("Parameter search took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(search.cv_results_['params'])))
report(search.cv_results_)
In [4]:
# Refit on the training split with explicit hyper-parameters and logistic loss,
# then score both partitions.
# NOTE(review): l2=0.1 lies outside the l2 range sampled by the search cell
# (uniform(0.0001, 0.01)) -- confirm these are the intended values.
model = VWClassifier(
    loss_function='logistic',
    l=0.01,
    l2=0.1
)
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
print('training score: {}'.format(train_score))
test_score = model.score(X_test, y_test)
print('testing score: {}'.format(test_score))

# Drop the reference to the classifier now that it has been evaluated.
del model
In [ ]:
In [5]:
# Linear Regression
from vowpalwabbit.sklearn_vw import VWRegressor
from sklearn import datasets
# Load the diabetes dataset; note this rebinds X and y from the earlier cells.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Fit a Vowpal Wabbit regressor on the full dataset (l=100).
model = VWRegressor(l=100)
model.fit(X, y)

# Show the fitted intercept, predictions for the first ten rows, and the
# score on the data the model was trained on.
intercept = model.get_intercept()
print('intercept: {}'.format(intercept))
first_rows = X[:10]
print('predictions: {}'.format(model.predict(first_rows)))
r2_train = model.score(X, y)
print('training R2 score: {}'.format(r2_train))
In [6]:
# Write the fitted regressor to disk, discard it, and load the file into a
# fresh VWRegressor instance.
model_path = 'test.model'
model.save(model_path)
del model

model = VWRegressor()
model.load(model_path)

# Print the same three diagnostics as the fitting cell so the outputs can be
# compared by eye.
intercept = model.get_intercept()
print('intercept: {}'.format(intercept))
first_rows = X[:10]
print('predictions: {}'.format(model.predict(first_rows)))
r2_train = model.score(X, y)
print('training R2 score: {}'.format(r2_train))
In [ ]: