In [1]:
import xgboost as xgb, numpy as np, pandas as pd, csv

In [2]:
trainData = pd.read_csv("D:/satisfy/input/train.csv")
testData = pd.read_csv("D:/satisfy/input/test.csv")

In [3]:
label = trainData['TARGET']

In [4]:
np.unique(label)


Out[4]:
array([0, 1], dtype=int64)

In [5]:
trainData = trainData.drop(["ID", "TARGET"], 1)

In [6]:
idLine = testData["ID"]

In [7]:
testData = testData.drop("ID", 1)

In [13]:
Droper = []
for col in trainData.columns.values:
    if (len(np.unique(trainData[col])) == 1):
        print "Dropped constant in training data:", col
        Droper.append(col)
trainData = trainData.drop(Droper, 1)


Dropped constant in training data: ind_var2_0
Dropped constant in training data: ind_var2
Dropped constant in training data: ind_var27_0
Dropped constant in training data: ind_var28_0
Dropped constant in training data: ind_var28
Dropped constant in training data: ind_var27
Dropped constant in training data: ind_var41
Dropped constant in training data: ind_var46_0
Dropped constant in training data: ind_var46
Dropped constant in training data: num_var27_0
Dropped constant in training data: num_var28_0
Dropped constant in training data: num_var28
Dropped constant in training data: num_var27
Dropped constant in training data: num_var41
Dropped constant in training data: num_var46_0
Dropped constant in training data: num_var46
Dropped constant in training data: saldo_var28
Dropped constant in training data: saldo_var27
Dropped constant in training data: saldo_var41
Dropped constant in training data: saldo_var46
Dropped constant in training data: imp_amort_var18_hace3
Dropped constant in training data: imp_amort_var34_hace3
Dropped constant in training data: imp_reemb_var13_hace3
Dropped constant in training data: imp_reemb_var33_hace3
Dropped constant in training data: imp_trasp_var17_out_hace3
Dropped constant in training data: imp_trasp_var33_out_hace3
Dropped constant in training data: num_var2_0_ult1
Dropped constant in training data: num_var2_ult1
Dropped constant in training data: num_reemb_var13_hace3
Dropped constant in training data: num_reemb_var33_hace3
Dropped constant in training data: num_trasp_var17_out_hace3
Dropped constant in training data: num_trasp_var33_out_hace3
Dropped constant in training data: saldo_var2_ult1
Dropped constant in training data: saldo_medio_var13_medio_hace3

In [14]:
testData = testData.drop(Droper, 1)

In [15]:
Droper = []
for col in testData.columns.values:
    if (len(np.unique(testData[col])) == 1):
        print "Dropped constant in test data:", col
        Droper.append(col)
trainData = trainData.drop(Droper, 1)
testData = testData.drop(Droper, 1)


Dropped constant in test data: delta_imp_reemb_var33_1y3
Dropped constant in test data: delta_imp_trasp_var17_out_1y3
Dropped constant in test data: delta_num_reemb_var33_1y3
Dropped constant in test data: delta_num_trasp_var17_out_1y3
Dropped constant in test data: imp_reemb_var17_hace3
Dropped constant in test data: imp_reemb_var33_ult1
Dropped constant in test data: imp_trasp_var17_out_ult1
Dropped constant in test data: num_reemb_var17_hace3
Dropped constant in test data: num_reemb_var33_ult1
Dropped constant in test data: num_trasp_var17_out_ult1
Dropped constant in test data: saldo_medio_var29_hace3

In [46]:
Droper = []
colList = []
for col in trainData.columns.values:
    for precol in colList:
        if (abs(np.corrcoef(trainData[precol], trainData[col])[0,1]) > 0.999):
            Droper.append(col)
            print "Feature %s is highly correlated with another feature, dropped" % col
            break
    colList.append(col)


Feature imp_op_var39_efect_ult3 is highly correlated with another feature, dropped
Feature ind_var13_medio is highly correlated with another feature, dropped
Feature ind_var18 is highly correlated with another feature, dropped
Feature ind_var26 is highly correlated with another feature, dropped
Feature ind_var25 is highly correlated with another feature, dropped
Feature ind_var29_0 is highly correlated with another feature, dropped
Feature ind_var29 is highly correlated with another feature, dropped
Feature ind_var32 is highly correlated with another feature, dropped
Feature ind_var34 is highly correlated with another feature, dropped
Feature ind_var37 is highly correlated with another feature, dropped
Feature ind_var39 is highly correlated with another feature, dropped
Feature num_var6_0 is highly correlated with another feature, dropped
Feature num_var6 is highly correlated with another feature, dropped
Feature num_var8_0 is highly correlated with another feature, dropped
Feature num_var8 is highly correlated with another feature, dropped
Feature num_var13_corto is highly correlated with another feature, dropped
Feature num_var13_medio_0 is highly correlated with another feature, dropped
Feature num_var13_medio is highly correlated with another feature, dropped
Feature num_var18_0 is highly correlated with another feature, dropped
Feature num_var18 is highly correlated with another feature, dropped
Feature num_var20_0 is highly correlated with another feature, dropped
Feature num_var20 is highly correlated with another feature, dropped
Feature num_var24 is highly correlated with another feature, dropped
Feature num_var26 is highly correlated with another feature, dropped
Feature num_var25 is highly correlated with another feature, dropped
Feature num_var29_0 is highly correlated with another feature, dropped
Feature num_var29 is highly correlated with another feature, dropped
Feature num_var32 is highly correlated with another feature, dropped
Feature num_var34_0 is highly correlated with another feature, dropped
Feature num_var34 is highly correlated with another feature, dropped
Feature num_var37 is highly correlated with another feature, dropped
Feature num_var40_0 is highly correlated with another feature, dropped
Feature num_var40 is highly correlated with another feature, dropped
Feature num_var39 is highly correlated with another feature, dropped
Feature num_var44 is highly correlated with another feature, dropped
Feature saldo_var18 is highly correlated with another feature, dropped
Feature saldo_var29 is highly correlated with another feature, dropped
Feature delta_imp_amort_var18_1y3 is highly correlated with another feature, dropped
Feature delta_imp_amort_var34_1y3 is highly correlated with another feature, dropped
Feature delta_num_aport_var13_1y3 is highly correlated with another feature, dropped
Feature delta_num_aport_var17_1y3 is highly correlated with another feature, dropped
Feature delta_num_aport_var33_1y3 is highly correlated with another feature, dropped
Feature delta_num_compra_var44_1y3 is highly correlated with another feature, dropped
Feature delta_num_reemb_var13_1y3 is highly correlated with another feature, dropped
Feature delta_num_reemb_var17_1y3 is highly correlated with another feature, dropped
Feature delta_num_trasp_var17_in_1y3 is highly correlated with another feature, dropped
Feature delta_num_trasp_var33_in_1y3 is highly correlated with another feature, dropped
Feature delta_num_trasp_var33_out_1y3 is highly correlated with another feature, dropped
Feature delta_num_venta_var44_1y3 is highly correlated with another feature, dropped
Feature imp_amort_var18_ult1 is highly correlated with another feature, dropped
Feature imp_trasp_var33_out_ult1 is highly correlated with another feature, dropped
Feature num_var7_emit_ult1 is highly correlated with another feature, dropped
Feature num_meses_var13_medio_ult3 is highly correlated with another feature, dropped
Feature num_op_var39_efect_ult3 is highly correlated with another feature, dropped
Feature num_reemb_var13_ult1 is highly correlated with another feature, dropped
Feature num_trasp_var17_in_hace3 is highly correlated with another feature, dropped
Feature num_trasp_var17_in_ult1 is highly correlated with another feature, dropped
Feature num_trasp_var33_out_ult1 is highly correlated with another feature, dropped
Feature saldo_medio_var13_medio_ult1 is highly correlated with another feature, dropped

In [48]:
trainData = trainData.drop(Droper, 1)
testData = testData.drop(Droper, 1)

In [49]:
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split

In [50]:
X_fit, X_eval, y_fit, y_eval= train_test_split(trainData, label, test_size=0.3)

In [51]:
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, learning_rate=0.03, nthread=8,
                        subsample=0.95, colsample_bytree=0.85, seed=4242)

In [53]:
clf.fit(trainData, label, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])


Out[53]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=8,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [54]:
print('Overall AUC:', roc_auc_score(label, clf.predict_proba(trainData)[:,1]))


('Overall AUC:', 0.88274809560232592)

In [56]:
y_pred= clf.predict_proba(testData)[:,1]
submission = pd.DataFrame({"ID":idLine, "TARGET":y_pred})
submission.to_csv("submission_xgboost_84_remove.csv", index=False)

In [ ]: