In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from random import randint
from gplearn.genetic import SymbolicTransformer
from sklearn.utils import check_random_state
train = pd.read_csv('./data/allstate/train.csv')
test = pd.read_csv('./data/allstate/test.csv')
test['loss'] = np.nan
joined = pd.concat([train, test])
# Custom objective: despite the inherited name, this is not logistic
# regression but the "fair" loss, a smooth approximation to MAE. It returns
# the gradient and hessian of con**2 * (|x|/con - log(1 + |x|/con)).
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2
    x = preds - labels
    grad = con * x / (np.abs(x) + con)
    hess = con ** 2 / (np.abs(x) + con) ** 2
    return grad, hess

# Custom metric: MAE on the original scale. The +/- shift cancels inside the
# absolute difference, so it is safe to omit here.
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))
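# Sanity check (illustrative sketch only): the gradient in logregobj should be
# the derivative of the fair loss con**2 * (|x|/con - log(1 + |x|/con)); a
# finite-difference estimate on a few random residuals should match it closely.
# The underscore-prefixed names here are throwaway helpers, not part of the model.
_con = 2.0
_x = check_random_state(0).randn(5)
_fair = lambda r: _con ** 2 * (np.abs(r) / _con - np.log1p(np.abs(r) / _con))
_eps = 1e-6
print(np.allclose((_fair(_x + _eps) - _fair(_x - _eps)) / (2 * _eps),
                  _con * _x / (np.abs(_x) + _con), atol=1e-5))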
cat_feature = [n for n in joined.columns if n.startswith('cat')]
cont_feature = [n for n in joined.columns if n.startswith('cont')]
# Integer-encode the categoricals; factorizing on the concatenated frame
# keeps the train and test codes consistent.
for column in cat_feature:
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]
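# What the loop above does, on a toy example: with sort=True the codes follow
# the lexicographic order of the unique values ('A' -> 0, 'B' -> 1, 'C' -> 2).
print(pd.factorize(np.array(['B', 'A', 'C', 'A']), sort=True)[0])  # [1 0 2 0]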
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]
# Train on log(loss + shift); the shift is an empirically tuned constant that
# stabilises the transform for small losses.
shift = 202
y = np.log(train['loss'] + shift)
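# Why the log transform: the raw target is heavily right-skewed, and training
# on the log scale lets the symmetric fair loss behave sensibly. A quick look
# at the skewness before and after:
print(train['loss'].skew(), y.skew())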
ids = test['id']
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)
print("Old shape of X Train={}".format(X.shape))
# Evolve 10 extra features via genetic programming over the raw columns.
gp = SymbolicTransformer(generations=5, population_size=100,
                         hall_of_fame=100, n_components=10,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9, verbose=1,
                         random_state=0, n_jobs=10)
gp.fit(X, y)
gp_features_train = gp.transform(X)
X = np.hstack((X, gp_features_train))
print("New shape of X Train={}".format(X.shape))
print("Old shape of X Test={}".format(X_test.shape))
gp_features_test = gp.transform(X_test)
X_test = np.hstack((X_test, gp_features_test))
print("New shape of X Test={}".format(X_test.shape))
# Optional: subsample for quicker experiments.
# X = X.sample(frac=0.1)
# y = y.iloc[X.index.values]
In [ ]:
X = pd.DataFrame(X)
X_test = pd.DataFrame(X_test)
n_folds = 6
kf = KFold(n_splits=n_folds)
prediction = np.zeros(ids.shape)
final_fold_prediction = []
final_fold_real = []
partial_evaluation = open('temp_scores.txt', 'w')
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print('\n Fold %d' % (i + 1))
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    RANDOM_STATE = randint(1, 429496)
    params = {
        'min_child_weight': 1,
        'eta': 0.001,  # very low learning rate, paired with many rounds and early stopping
        'colsample_bytree': 0.5,
        'max_depth': 12,
        'subsample': 0.8,
        'alpha': 1,
        'gamma': 1,
        'silent': 1,
        'seed': RANDOM_STATE
    }
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    xgval = xgb.DMatrix(X_val, label=y_val)
    xgtest = xgb.DMatrix(X_test)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]
    model = xgb.train(params, xgtrain, 100000, watchlist, obj=logregobj, feval=evalerror,
                      early_stopping_rounds=300, verbose_eval=100)
    # Accumulate test predictions (back on the original scale) across folds.
    prediction += np.exp(model.predict(xgtest)) - shift
    # Keep out-of-fold predictions and targets for an overall CV estimate.
    fold_pred = pd.Series(np.exp(model.predict(xgval)) - shift)
    final_fold_prediction.append(fold_pred)
    final_fold_real.append(np.exp(y_val) - shift)
    temp_cv_score = mean_absolute_error(fold_pred, np.exp(y_val) - shift)
    partial_evaluation.write('fold ' + str(i) + ' ' + str(temp_cv_score) + '\n')
    partial_evaluation.flush()
prediction = prediction / n_folds
submission = pd.DataFrame()
submission['id'] = ids
submission['loss'] = prediction
submission.to_csv('sub_gp.csv', index=False)
final_fold_prediction = pd.concat(final_fold_prediction, ignore_index=True)
final_fold_real = pd.concat(final_fold_real, ignore_index=True)
cv_score = mean_absolute_error(final_fold_prediction, final_fold_real)
print("CV Score={}".format(cv_score))
partial_evaluation.close()
In [ ]: