In [2]:
import os
import pandas as pd
import numpy as np
from stackregression import stack_regression_step1, stack_regression_step2, print_prediction_report
from utils import encode_numeric_zscore_list, encode_numeric_zscore_all, to_xy, encode_text_index_list, encode_numeric_log_all
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.datasets import dump_svmlight_file
from scipy.sparse import csr_matrix, hstack
from vowpalwabbit.sklearn_vw import VWRegressor
from random import randint
from sklearn.metrics import explained_variance_score,r2_score,mean_absolute_error
from utils import get_allstate_train_valid_test_testids
from operator import itemgetter
from time import time
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats.distributions import uniform
In [3]:
# Offset passed to the data loader; the evaluation cell subtracts it again
# after exponentiating the model's predictions.
shift = 200
# NOTE(review): 0.15 and True are opaque positional arguments here —
# presumably the validation fraction and a log-transform flag; confirm in utils.
splits = get_allstate_train_valid_test_testids(0.15, shift, True)
train, valid, test, testids = splits
In [4]:
# Separate the "loss" target column from the remaining training columns.
x_train = train.drop(labels="loss", axis=1)
y_train = train["loss"]
In [5]:
# Separate the "loss" target column from the remaining validation columns.
x_valid = valid.drop(labels="loss", axis=1)
y_valid = valid["loss"]
In [14]:
# Interactive `VWRegressor?` lookup removed: it is IPython-only syntax that
# opens a pager on every run. Use `help(VWRegressor)` ad hoc when needed.
In [ ]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a ranked summary of the best candidates from a parameter search.

    Entries are ordered by the value at position 1 of each entry, highest
    first, and only the first ``n_top`` are printed.

    Parameters
    ----------
    grid_scores : iterable
        Entries that are indexable (position 1 is the sort key) and expose
        ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int, optional
        Maximum number of entries to print (default 3).

    Returns
    -------
    None
        The summary is written to standard output.
    """
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, candidate in enumerate(ranked[:n_top], start=1):
        # Spread of the per-fold scores for this candidate.
        fold_std = np.std(candidate.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            candidate.mean_validation_score, fold_std))
        print("Parameters: {0}".format(candidate.parameters))
        print("")
# Randomized search: n_iter candidate settings are sampled from the
# distributions/lists below (this is not an exhaustive grid).
# No random_state is passed to the search, so sampling relies on the global
# NumPy RNG seeded here.
np.random.seed(0)
n_iter = 200
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so "l2" ranges
# over [0.0001, 0.0101] and "power_t" (default loc=0, scale=1) over [0, 1].
params = {"l2": uniform(0.0001, 0.01),
          "l": [0.01, 0.1, 1.0],
          "power_t": uniform(),
          "passes": [5,10,15,20,25,30,35,40]}
# run search
search = RandomizedSearchCV(VWRegressor(), param_distributions=params, n_iter=n_iter)
start = time()
search.fit(x_train, y_train)
print("Parameter search took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(search.grid_scores_)))
report(search.grid_scores_)
# build model
# The evaluation cell calls `model.predict`, so `model` must be bound on a
# fresh kernel. With the search's default refit behaviour, best_estimator_ is
# the best candidate refitted on the full training data.
model = search.best_estimator_
# Manual alternative to the search, kept for reference:
#model = VWRegressor(passes=200 , quiet=False, learning_rate=0.1, audit=False, progress=0.5, permutations=True)
#model.fit(x_train, y_train)
# evaluate model
#model.score(x_train, y_train)
#model.score(x_valid, y_valid)
In [12]:
# Map the model output back to the loss scale: exponentiate, then remove the
# `shift` offset. NOTE(review): assumes the target was log(loss + shift) — confirm in utils.
raw_predictions = model.predict(x_valid)
predictions = np.exp(raw_predictions) - shift
# Mean absolute error of the back-transformed predictions on the validation set.
score = mean_absolute_error(y_valid, predictions)
print("\tMAE {0}\n\n".format(score))
In [ ]: