In [ ]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from random import randint
from gplearn.genetic import SymbolicRegressor, SymbolicTransformer
from sklearn.utils import check_random_state
train = pd.read_csv('./data/allstate/train.csv')
test = pd.read_csv('./data/allstate/test.csv')
test['loss'] = np.nan
joined = pd.concat([train, test])
# Custom XGBoost objective: gradient and Hessian of the Fair loss, a smooth
# approximation to MAE. (The name "logregobj" is kept from the original
# kernel, although this is not a logistic objective.)
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2  # Fair-loss constant; controls how closely the loss tracks MAE
    x = preds - labels
    grad = con * x / (np.abs(x) + con)
    hess = con ** 2 / (np.abs(x) + con) ** 2
    return grad, hess
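# For reference, these are the first and second derivatives of the Fair loss
#   L(x) = con**2 * (|x|/con - log(1 + |x|/con)),  with x = preds - labels,
# which behaves like MAE for large residuals while staying smooth at zero.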
# Custom evaluation metric: MAE on the original scale. Targets are
# log(loss + shift), and the constant shift cancels inside the absolute error.
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))
cat_feature = [n for n in joined.columns if n.startswith('cat')]
cont_feature = [n for n in joined.columns if n.startswith('cont')]
# Integer-encode the categorical columns
for column in cat_feature:
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]
shift = 202  # offset applied before the log transform to reduce target skew
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)
print("Old shape of X Train={}".format(X.shape))
gp = SymbolicTransformer(generations=20, population_size=2000,
hall_of_fame=100, n_components=20,
parsimony_coefficient=0.0005,
max_samples=0.9, verbose=1,
random_state=0, n_jobs=10)
gp.fit(X, y)
gp_features_train = gp.transform(X)
X = np.hstack((X, gp_features_train))
print("New shape of X Train={}".format(X.shape))
print("Old shape of X Test={}".format(X_test.shape))
gp_features_test = gp.transform(X_test)
X_test = np.hstack((X_test, gp_features_test))
print("New shape of X Test={}".format(X_test.shape))
#X = X.sample(frac=0.1)
#y = y.iloc[X.index.values]
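With the GP features appended, the matrices are ready for XGBoost. The cell
below is a minimal sketch, not the original kernel's training code: the
hyperparameters are illustrative placeholders, and it fits a single KFold
split just to show how logregobj and evalerror plug into xgb.train.
In [ ]:
params = {'eta': 0.1, 'max_depth': 6, 'subsample': 0.9,
          'colsample_bytree': 0.7}  # illustrative values, not tuned
kf = KFold(n_splits=5, shuffle=True, random_state=0)
train_idx, valid_idx = next(iter(kf.split(X)))
dtrain = xgb.DMatrix(X[train_idx], label=y.iloc[train_idx])
dvalid = xgb.DMatrix(X[valid_idx], label=y.iloc[valid_idx])
# obj= supplies the Fair-loss gradients; feval= reports MAE on the
# original loss scale after undoing the log transform.
model = xgb.train(params, dtrain, num_boost_round=500,
                  evals=[(dvalid, 'valid')], obj=logregobj,
                  feval=evalerror, early_stopping_rounds=50)
# predictions come back on the log scale: np.exp(pred) - shift recovers loss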
In [ ]: