In [1]:
import xgboost as xgb, numpy as np, pandas as pd, csv

In [2]:
with open("D:/satisfy/input/train.csv", "rb") as trainFile:
    lines = np.array(list(csv.reader(trainFile))[1:])
    label = np.array(lines[:,-1]).astype(int)
    trainData = np.array(lines[:,1:-1]).astype(float)

with open("D:/satisfy/input/test.csv", "r") as testFile:
    lines = np.array(list(csv.reader(testFile))[1:])
    testData = np.array(lines[:, 1:]).astype(float)

with open("D:/satisfy/input/sample_submission.csv", "r") as benchmarkFile:
    lines = np.array(list(csv.reader(benchmarkFile))[1:])
    benchmarklabel = np.array(lines[:, 1]).astype(int)

In [3]:
train = xgb.DMatrix( trainData, label=label)
test = xgb.DMatrix(testData, label=benchmarklabel)

In [4]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 100
param['silent'] = 0
param['nthread'] = 8
param['num_class'] = 2

In [5]:
watchlist = [ (train,'train'), (test, 'test') ]
num_round = 50
bst = xgb.train(param, train, num_round, watchlist )
# get prediction
pred = bst.predict( test )

In [6]:
pred


Out[6]:
array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [7]:
sum(pred)


Out[7]:
639.0

In [8]:
np.savetxt("submission_xgboost_santander.csv", pred, delimiter = ",", fmt = "%f")

In [9]:
temp = bst.classify(test)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-0add593886d2> in <module>()
----> 1 temp = bst.classify(test)

AttributeError: 'Booster' object has no attribute 'classify'

In [6]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softprob'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 10000
param['silent'] = 0
param['nthread'] = 4
param['num_class'] = 2

In [7]:
watchlist = [ (train,'train'), (test, 'test') ]
num_round = 100
bst = xgb.train(param, train, num_round, watchlist )
# get prediction
pred = bst.predict( test )

In [8]:
pred


Out[8]:
array([[  9.94797468e-01,   5.20246569e-03],
       [  9.93119717e-01,   6.88032620e-03],
       [  9.99837637e-01,   1.62375101e-04],
       ..., 
       [  9.90352929e-01,   9.64711886e-03],
       [  9.46197212e-01,   5.38027249e-02],
       [  9.99875665e-01,   1.24295853e-04]], dtype=float32)

In [9]:
np.savetxt("submission_xgboost_santander.csv", pred[:,1], delimiter = ",", fmt = "%f")

In [25]:
clf = xgb.XGBClassifier(missing=np.nan, max_depth=100, n_estimators=500, learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=7028)

In [26]:
clf.fit(trainData, label, eval_metric="auc")


Out[26]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=100,
       min_child_weight=1, missing=None, n_estimators=500, nthread=8,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7028, silent=True, subsample=0.95)

In [10]:
from sklearn.cross_validation import train_test_split

In [12]:
X_fit, X_eval, y_fit, y_eval= train_test_split(trainData, label, test_size=0.3)

In [16]:
clf.fit(trainData, label, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])


Out[16]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [30]:
print('Overall AUC:', roc_auc_score(label, clf.predict_proba(trainData)[:,1]))


('Overall AUC:', 0.99805417624441795)

In [18]:
from sklearn.metrics import roc_auc_score

In [31]:
# predicting
y_pred= clf.predict_proba(testData)[:,1]
submission = pd.DataFrame({"TARGET":y_pred})
submission.to_csv("submission_xgboost_84.csv", index=False)

In [ ]: