In [2]:
# Binary Classification
from __future__ import print_function

from vowpalwabbit.sklearn_vw import VWClassifier

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# get some data
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X = X.astype(np.float32)

# split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)

# build vowpal wabbit model
model = VWClassifier()
model.fit(X_train, y_train)

# evaluate
print('training score: {}'.format(model.score(X_train, y_train)))
print('testing score: {}'.format(model.score(X_test, y_test)))


training score: 0.522125
testing score: 0.511

In [3]:
# Parameter Grid Search
# http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py

from operator import itemgetter
from time import time
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats.distributions import uniform

# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
        
# use a full grid over all parameters
np.random.seed(0)
n_iter = 20
params = {"l2": uniform(0.0001, 0.01),
          "l": [0.01, 0.1, 1.0],
          "power_t": uniform()}

# run search
search = RandomizedSearchCV(VWClassifier(), param_distributions=params, n_iter=n_iter)
start = time()
search.fit(X, y)

print("Parameter search took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(search.grid_scores_)))
report(search.grid_scores_)


Parameter search took 53.03 seconds for 20 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.552 (std: 0.006)
Parameters: {'l': 0.01, 'l2': 0.006918202991034834, 'power_t': 0.359507900573786}

Model with rank: 2
Mean validation score: 0.541 (std: 0.012)
Parameters: {'l': 0.1, 'l2': 0.006767667154456677, 'power_t': 0.6706378696181594}

Model with rank: 3
Mean validation score: 0.541 (std: 0.009)
Parameters: {'l': 0.01, 'l2': 0.005318483217500717, 'power_t': 0.4146619399905236}


In [4]:
# evaluate
model = VWClassifier(loss_function='logistic', l=0.01, l2=0.1)
model.fit(X_train, y_train)

print('training score: {}'.format(model.score(X_train, y_train)))
print('testing score: {}'.format(model.score(X_test, y_test)))

# cleanup
del model


training score: 0.55475
testing score: 0.541

In [ ]:


In [5]:
# Linear Regression

from vowpalwabbit.sklearn_vw import VWRegressor
from sklearn import datasets

# Load the diabetes dataset
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

model = VWRegressor(l=100)
model.fit(X, y)

print('intercept: {}'.format(model.get_intercept()))
print('predictions: {}'.format(model.predict(X[:10])))
print('training R2 score: {}'.format(model.score(X, y)))


intercept: 146.3594207763672
predictions: [ 194.89273071   71.24694061  164.77400208  158.98957825  130.94725037
  104.06398773   71.54694366  123.39061737  144.63090515  206.62693787]
training R2 score: 0.5049730154443529

In [6]:
# Save the model and reload it
model.save('test.model')
del model
model = VWRegressor()
model.load('test.model')
print('intercept: {}'.format(model.get_intercept()))
print('predictions: {}'.format(model.predict(X[:10])))
print('training R2 score: {}'.format(model.score(X, y)))


intercept: 146.3594207763672
predictions: [ 194.89273071   71.24694061  164.77400208  158.98957825  130.94725037
  104.06398773   71.54694366  123.39061737  144.63090515  206.62693787]
training R2 score: 0.5049730154443529

In [ ]: