# In [ ]:  (Jupyter cell marker left by notebook export; commented so the file is valid Python)
# Standard library
from random import randint

# Third-party
import numpy as np
import pandas as pd
import xgboost as xgb
from gplearn.genetic import SymbolicRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in 0.20
# --- Data loading and preprocessing -------------------------------------
# Train and test are concatenated so categorical levels are factorized over
# the union of both sets, keeping integer codes consistent between them.
train = pd.read_csv('./data/allstate/train.csv')
test = pd.read_csv('./data/allstate/test.csv')
test['loss'] = np.nan  # placeholder so train/test share the same columns
joined = pd.concat([train, test])

cat_feature = [n for n in joined.columns if n.startswith('cat')]
cont_feature = [n for n in joined.columns if n.startswith('cont')]

# Encode categoricals as integer codes; sort=True makes the code assignment
# deterministic across runs.
for column in cat_feature:
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]

# Split back apart: rows with a known loss are the training data.
# .copy() avoids SettingWithCopyWarning on any later column assignment.
train = joined[joined['loss'].notnull()].copy()
test = joined[joined['loss'].isnull()].copy()

# Log-shift transform stabilizes the heavily right-skewed loss target;
# predictions are inverted later with exp(pred) - shift.
shift = 202
y = np.log(train['loss'] + shift)
ids = test['id']
# keyword axis=1: the positional axis argument was deprecated and removed in pandas 2.0
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)
#X = X.sample(frac=0.1)
#y = y.iloc[X.index.values]
# --- Out-of-fold training ------------------------------------------------
# Train one SymbolicRegressor per KFold split and average the test-set
# predictions. Models are fit in log space; exp(.) - shift inverts the
# target transform. Subtracting shift inside the per-fold sum is equivalent
# to subtracting it once after averaging.
n_folds = 5
# Modern scikit-learn API: KFold takes n_splits and the data is passed to
# kf.split() (the old KFold(n, n_folds=...) constructor was removed).
kf = KFold(n_splits=n_folds)
prediction = np.zeros(ids.shape)
final_fold_prediction = []
final_fold_real = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print('\n Fold %d' % (i + 1))
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    model = SymbolicRegressor(population_size=500, generations=200,
                              stopping_criteria=0.001,
                              p_crossover=0.7, p_subtree_mutation=0.1,
                              p_hoist_mutation=0.05, p_point_mutation=0.1,
                              max_samples=0.9, verbose=1,
                              parsimony_coefficient="auto", random_state=0)
    model.fit(X_train, y_train)
    prediction += np.exp(model.predict(X_test)) - shift

prediction = prediction / n_folds
# Write the averaged fold predictions in the Kaggle submission format
# (two columns: id, loss; no row index).
submission = pd.DataFrame({'id': ids, 'loss': prediction})
submission.to_csv('./data/allstate/sub_genetic_5folds.csv', index=False)
# In [ ]:  (Jupyter cell marker left by notebook export; commented so the file is valid Python)