In [1]:
import xgboost as xgb, numpy as np, pandas as pd, csv
In [2]:
# Load the train/test features and the benchmark labels from the sample
# submission. NOTE: csv.reader requires TEXT mode in Python 3 — the original
# opened the train file in binary ("rb"), which raises an error; open all
# three with "r" and newline="" as the csv module docs recommend.
# NOTE(review): absolute Windows paths ("D:/satisfy/...") are not portable —
# consider a configurable DATA_DIR.
with open("D:/satisfy/input/train.csv", "r", newline="") as trainFile:
    lines = np.array(list(csv.reader(trainFile))[1:])   # [1:] drops header row
    label = np.array(lines[:, -1]).astype(int)          # last column = TARGET
    trainData = np.array(lines[:, 1:-1]).astype(float)  # drop ID and TARGET
with open("D:/satisfy/input/test.csv", "r", newline="") as testFile:
    lines = np.array(list(csv.reader(testFile))[1:])
    testData = np.array(lines[:, 1:]).astype(float)     # drop ID column
with open("D:/satisfy/input/sample_submission.csv", "r", newline="") as benchmarkFile:
    lines = np.array(list(csv.reader(benchmarkFile))[1:])
    benchmarklabel = np.array(lines[:, 1]).astype(int)  # placeholder labels only
In [3]:
# Package the feature arrays (plus labels) into xgboost's DMatrix containers;
# the "test" labels are just the sample-submission placeholders.
train, test = (
    xgb.DMatrix(trainData, label=label),
    xgb.DMatrix(testData, label=benchmarklabel),
)
In [4]:
# Booster configuration: softmax multi-class objective over the two TARGET
# classes, so predict() returns hard 0/1 labels.
param = {
    'objective': 'multi:softmax',  # emit hard class labels
    'eta': 0.1,                    # learning rate
    'max_depth': 100,              # very deep trees — prone to overfitting
    'silent': 0,                   # verbose logging (deprecated in newer xgboost)
    'nthread': 8,                  # CPU threads
    'num_class': 2,                # binary target encoded as 2 classes
}
In [5]:
# Fit a 50-round booster, logging the eval metric on both matrices
# ("test" labels here are only the sample-submission placeholders).
num_round = 50
watchlist = [(train, 'train'), (test, 'test')]
bst = xgb.train(param, train, num_boost_round=num_round, evals=watchlist)
# hard 0/1 class labels, because the objective is multi:softmax
pred = bst.predict(test)
In [6]:
pred  # inspect the predicted labels (hard 0/1 values from multi:softmax)
Out[6]:
In [7]:
# Count of rows predicted as class 1 (labels are 0/1, so the sum is a count).
# Vectorized numpy reduction instead of Python's element-by-element sum().
pred.sum()
Out[7]:
In [8]:
# NOTE(review): writes a single unnamed column of hard 0/1 labels — no ID
# column and no header row; presumably the competition expects "ID,TARGET"
# (see sample_submission.csv) — confirm the required format before submitting.
np.savetxt("submission_xgboost_santander.csv", pred, delimiter = ",", fmt = "%f")
In [9]:
# Fix: xgboost's Booster has no classify() method — the original line raised
# AttributeError. predict() is the inference entry point on a trained Booster.
temp = bst.predict(test)
In [6]:
# Second configuration: multi:softprob returns per-class probabilities
# (usable for a ranking/AUC-style submission) instead of hard labels.
param = dict(
    objective='multi:softprob',  # per-class probability output
    eta=0.1,                     # learning rate
    max_depth=10000,             # effectively unbounded tree depth
    silent=0,                    # verbose logging (deprecated in newer xgboost)
    nthread=4,                   # CPU threads
    num_class=2,                 # binary target encoded as 2 classes
)
In [7]:
# Retrain with the softprob configuration for 100 rounds.
num_round = 100
watchlist = [(train, 'train'), (test, 'test')]
bst = xgb.train(param, train, num_boost_round=num_round, evals=watchlist)
# per-class probabilities, one column per class
pred = bst.predict(test)
In [8]:
pred  # inspect the per-class probabilities from multi:softprob
Out[8]:
In [9]:
# Column 1 = probability of class 1. NOTE(review): still no ID column or
# header row — confirm against sample_submission.csv before submitting.
np.savetxt("submission_xgboost_santander.csv", pred[:,1], delimiter = ",", fmt = "%f")
In [25]:
# Sklearn-style wrapper: 500 deep trees, small learning rate, row/column
# subsampling, and a fixed seed (7028) for reproducibility.
clf = xgb.XGBClassifier(missing=np.nan, max_depth=100, n_estimators=500, learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=7028)
In [26]:
# Fit on the full training set; no eval_set is supplied here, so there is no
# early stopping and eval_metric="auc" has no held-out data to score.
clf.fit(trainData, label, eval_metric="auc")
Out[26]:
In [10]:
# Fix: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [12]:
# Hold out 30% for early-stopping evaluation. Fix: the original passed no
# random_state, so the split changed on every kernel restart; pin it to the
# same seed (7028) used by the classifier for reproducibility.
X_fit, X_eval, y_fit, y_eval = train_test_split(trainData, label, test_size=0.3, random_state=7028)
In [16]:
# Fix: the original trained on the FULL training set while early-stopping on
# (X_eval, y_eval), which is a subset of that same data — the model is
# evaluated on rows it trained on, defeating early stopping. Train on the
# disjoint X_fit portion only.
clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])
Out[16]:
In [30]:
# Fix: roc_auc_score was imported in a LATER cell (out-of-order execution),
# so this fails on a fresh Restart & Run All; import locally so the cell is
# self-contained. NOTE: this is training-set AUC — optimistic vs. held-out.
from sklearn.metrics import roc_auc_score
print('Overall AUC:', roc_auc_score(label, clf.predict_proba(trainData)[:,1]))
In [18]:
from sklearn.metrics import roc_auc_score
In [31]:
# Score the test rows and write the probability of class 1 as TARGET.
# NOTE(review): output has only a TARGET column — confirm whether the
# submission format also requires an ID column.
probs_positive = clf.predict_proba(testData)[:, 1]
pd.DataFrame({"TARGET": probs_positive}).to_csv("submission_xgboost_84.csv", index=False)
In [ ]: