In [2]:
# Imports — stdlib, then third-party, then project-local.
# Fix: lines duplicating `from sklearn import preprocessing` and
# `from sklearn.cross_validation import train_test_split` removed.
import os
from operator import itemgetter
from random import randint
from time import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from scipy.stats.distributions import uniform
from sklearn import preprocessing
# NOTE(review): sklearn.cross_validation and sklearn.grid_search were
# deprecated in scikit-learn 0.18 and removed in 0.20; the modern homes are
# sklearn.model_selection.train_test_split / RandomizedSearchCV. Kept as-is
# because the search cell below relies on the legacy `grid_scores_` attribute.
from sklearn.cross_validation import train_test_split
from sklearn.datasets import dump_svmlight_file
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error
from vowpalwabbit.sklearn_vw import VWRegressor

from stackregression import stack_regression_step1, stack_regression_step2, print_prediction_report
from utils import (
    encode_numeric_zscore_list,
    encode_numeric_zscore_all,
    to_xy,
    encode_text_index_list,
    encode_numeric_log_all,
    get_allstate_train_valid_test_testids,
)

In [3]:
# Offset used by the target transform (the `True` flag below presumably
# enables a log(loss + shift) encoding inside utils.py — TODO confirm; the
# inverse `np.exp(pred) - shift` is applied in cell In[12]).
shift=200
# 0.15 = validation fraction. Returns train/valid/test DataFrames plus the
# test-set ids (for submission). Shapes are echoed in the output below.
train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)


Train shape is: (188318, 132)
Test shape is: (125546, 131)
/home/arvc/t81_558_deep_learning/utils.py:139: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train.drop("type", axis=1, inplace=True)
/home/arvc/t81_558_deep_learning/utils.py:140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test.drop("type", axis=1, inplace=True)
Final Train shape is: (160070, 131)
Final Valid shape is: (28248, 131)
Final Test shape is: (125546, 131)

In [4]:
# Features/target split for the training set: "loss" is the target,
# everything else is a feature column.
target_col = "loss"
y_train = train[target_col]
x_train = train.drop(target_col, axis=1)

In [5]:
# Same features/target split for the validation set.
target_col = "loss"
y_valid = valid[target_col]
x_valid = valid.drop(target_col, axis=1)

In [14]:
# IPython introspection: displays VWRegressor's signature and docstring.
# NOTE(review): interactive-only leftover — the trailing `?` is not valid
# Python outside IPython; remove before exporting/automating this notebook.
VWRegressor?

In [ ]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print rank, mean CV score (with std), and parameters for the
    `n_top` highest-scoring entries of a legacy sklearn `grid_scores_`
    list (tuples of (parameters, mean_validation_score, cv_validation_scores))."""
    # Index 1 of each legacy score tuple is the mean validation score;
    # sort best-first and keep the top n.
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score,
              np.std(entry.cv_validation_scores)))
        print("Parameters: {0}".format(entry.parameters))
        print("")
        
# use a full grid over all parameters
# Seed numpy so the parameter draws from the `uniform` distributions are
# reproducible; NOTE(review): VW itself may have internal randomness that
# this seed does not control — confirm if exact reproducibility is needed.
np.random.seed(0)
n_iter = 200
# Search space: l2 regularization strength, learning rate (l),
# power_t (learning-rate decay exponent), and number of passes over the data.
params = {"l2": uniform(0.0001, 0.01),
          "l": [0.01, 0.1, 1.0],
          "power_t": uniform(),
          "passes": [5,10,15,20,25,30,35,40]}

# run search
# NOTE(review): `grid_scores_` exists only on the legacy
# sklearn.grid_search.RandomizedSearchCV; the sklearn.model_selection
# replacement exposes `cv_results_` instead, so porting requires changes here.
search = RandomizedSearchCV(VWRegressor(), param_distributions=params, n_iter=n_iter)
start = time()
search.fit(x_train, y_train)

print("Parameter search took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(search.grid_scores_)))
report(search.grid_scores_)

# build model
# NOTE(review): hidden-state hazard — cell In[12] below reads `model`, which
# is only ever assigned here and is commented out. The notebook will not
# survive Restart Kernel -> Run All as-is; uncomment this (or assign
# `model = search.best_estimator_`) before a full re-run.
#model = VWRegressor(passes=200 , quiet=False, learning_rate=0.1, audit=False, progress=0.5, permutations=True)
#model.fit(x_train, y_train)

# evaluate model
#model.score(x_train, y_train)
#model.score(x_valid, y_valid)

In [12]:
# NOTE(review): `model` is not defined by any active cell above (its build is
# commented out in the search cell), so this cell relies on leftover kernel
# state and fails on a fresh Restart & Run All.
# Inverts the presumed log(loss + shift) target transform to get predictions
# back on the raw loss scale — TODO confirm that `y_valid` (valid["loss"])
# is raw-scale, which the ~2681 MAE printed below suggests.
predictions = np.exp(model.predict(x_valid)) - shift
score = mean_absolute_error(y_valid, predictions)
print("\tMAE {0}\n\n".format(score))


	MAE 2681.2398237221023



In [ ]: