In [ ]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from random import randint
from gplearn.genetic import SymbolicRegressor

# Read the train and test tables from their CSV files (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/allstate/train.csv', './data/allstate/test.csv')
)

# Give the test frame a placeholder target column so it can be stacked under
# the train frame and both can be encoded in one pass.
test['loss'] = np.nan
joined = pd.concat([train, test])

# Split the feature columns by their name prefix.
cat_feature = [n for n in joined.columns if n.startswith('cat')]
cont_feature = [n for n in joined.columns if n.startswith('cont')]

# Replace every categorical value with an integer code. Factorizing on the
# joined frame gives train and test rows the same code for the same value.
for column in cat_feature:
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]

# Separate the frames again using the NaN placeholder written above.
# NOTE(review): assumes the original train 'loss' column has no missing
# values, otherwise those rows would land in `test` — confirm.
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]

# The model is trained on log(loss + shift); predictions are mapped back
# with exp(.) - shift when they are accumulated.
shift = 202
y = np.log(train['loss'] + shift)
ids = test['id']
# Pass `axis` by keyword: the positional form drop(labels, 1) was deprecated
# in pandas 1.3 and is rejected by pandas 2.0+.
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)

#X = X.sample(frac=0.1)
#y = y .iloc[X.index.values]

# Cross-validation setup: fold count, fold index generator, an accumulator for
# the summed test-set predictions, and two lists for per-fold results.
# NOTE(review): this KFold comes from sklearn.cross_validation, which emits a
# DeprecationWarning (deprecated in 0.18, removed in 0.20); its replacement in
# sklearn.model_selection has a different interface.
n_folds = 5
kf = KFold(len(X), n_folds=n_folds)
prediction = np.zeros(len(ids))

final_fold_prediction, final_fold_real = [], []

# Fit one symbolic regressor per fold, score it on the held-out rows and add
# its test-set predictions to the running sum in `prediction`.
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i + 1))
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    model = SymbolicRegressor(population_size=500, generations=200, stopping_criteria=0.001,
                              p_crossover=0.7, p_subtree_mutation=0.1,
                              p_hoist_mutation=0.05, p_point_mutation=0.1,
                              max_samples=0.9, verbose=1, parsimony_coefficient="auto", random_state=0
                              )
    model.fit(X_train, y_train)

    # Score the held-out fold on the original loss scale (undo log(loss + shift)).
    # Previously X_val / y_val were sliced but never used, so the folds gave no
    # validation signal and the two result lists stayed empty.
    fold_prediction = np.exp(model.predict(X_val)) - shift
    fold_real = np.exp(y_val.values) - shift
    final_fold_prediction.append(fold_prediction)
    final_fold_real.append(fold_real)
    # Plain numpy MAE: a non-finite prediction shows up as inf/nan in the
    # printed score instead of raising and aborting the remaining folds.
    fold_mae = np.mean(np.abs(fold_real - fold_prediction))
    print(' Fold %d validation MAE: %.5f' % (i + 1, fold_mae))

    prediction += np.exp(model.predict(X_test)) - shift

# Overall out-of-fold MAE across every held-out row.
if final_fold_real:
    cv_mae = np.mean(np.abs(np.concatenate(final_fold_real)
                            - np.concatenate(final_fold_prediction)))
    print('\n CV validation MAE: %.5f' % cv_mae)

# Turn the summed per-fold test predictions into their mean.
prediction = prediction / n_folds

# Pair each test id with its averaged loss and write the result as a CSV
# file without the index column.
submission = ids.to_frame(name='id').assign(loss=prediction)
submission.to_csv('./data/allstate/sub_genetic_5folds.csv', index=False)


/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
 Fold 1
    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    10.51    7.89714653549        4    1.96136132291    1.94790771869     67.90m
   1    11.21    6.37639538706        7    1.20438451199    1.19706575119     66.10m

In [ ]: