In [ ]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from random import randint
from gplearn.genetic import SymbolicRegressor, SymbolicTransformer
from sklearn.utils import check_random_state
train = pd.read_csv('./data/allstate/train.csv')
test = pd.read_csv('./data/allstate/test.csv')
test['loss'] = np.nan
joined = pd.concat([train, test])
# Custom XGBoost objective: gradient and Hessian of the Fair loss, a smooth
# approximation to MAE. (The name "logregobj" is kept from the original
# kernel, although this is not a logistic objective.)
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2  # Fair-loss constant; controls how closely the loss tracks MAE
    x = preds - labels
    grad = con * x / (np.abs(x) + con)
    hess = con ** 2 / (np.abs(x) + con) ** 2
    return grad, hess
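# For reference, these are the first and second derivatives of the Fair loss
#   L(x) = con**2 * (|x|/con - log(1 + |x|/con)),  with x = preds - labels,
# which behaves like MAE for large residuals while staying smooth at zero.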
# Custom evaluation metric: MAE on the original scale. Targets are
# log(loss + shift), and the constant shift cancels inside the absolute error.
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))
cat_feature = [n for n in joined.columns if n.startswith('cat')]
cont_feature = [n for n in joined.columns if n.startswith('cont')]
# Integer-encode the categorical columns
for column in cat_feature:
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]
shift = 202  # offset applied before the log transform to reduce target skew
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)
print("Old shape of X Train={}".format(X.shape))
gp = SymbolicTransformer(generations=20, population_size=2000,
hall_of_fame=100, n_components=20,
parsimony_coefficient=0.0005,
max_samples=0.9, verbose=1,
random_state=0, n_jobs=10)
gp.fit(X, y)
gp_features_train = gp.transform(X)
X = np.hstack((X, gp_features_train))
print("New shape of X Train={}".format(X.shape))
print("Old shape of X Test={}".format(X_test.shape))
gp_features_test = gp.transform(X_test)
X_test = np.hstack((X_test, gp_features_test))
print("New shape of X Test={}".format(X_test.shape))
#X = X.sample(frac=0.1)
#y = y.iloc[X.index.values]
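With the GP features appended, the matrices are ready for XGBoost. The cell
below is a minimal sketch, not the original kernel's training code: the
hyperparameters are illustrative placeholders, and it fits a single KFold
split just to show how logregobj and evalerror plug into xgb.train.
In [ ]:
params = {'eta': 0.1, 'max_depth': 6, 'subsample': 0.9,
          'colsample_bytree': 0.7}  # illustrative values, not tuned
kf = KFold(n_splits=5, shuffle=True, random_state=0)
train_idx, valid_idx = next(iter(kf.split(X)))
dtrain = xgb.DMatrix(X[train_idx], label=y.iloc[train_idx])
dvalid = xgb.DMatrix(X[valid_idx], label=y.iloc[valid_idx])
# obj= supplies the Fair-loss gradients; feval= reports MAE on the
# original loss scale after undoing the log transform.
model = xgb.train(params, dtrain, num_boost_round=500,
                  evals=[(dvalid, 'valid')], obj=logregobj,
                  feval=evalerror, early_stopping_rounds=50)
# predictions come back on the log scale: np.exp(pred) - shift recovers loss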
In [ ]: