In [1]:
import xgboost as xgb, numpy as np, pandas as pd, csv
In [2]:
# Load the train/test features and the benchmark labels from the sample
# submission. NOTE: csv.reader requires TEXT mode in Python 3 — the original
# opened the train file in binary ("rb"), which raises an error; open all
# three with "r" and newline="" as the csv module docs recommend.
# NOTE(review): absolute Windows paths ("D:/satisfy/...") are not portable —
# consider a configurable DATA_DIR.
with open("D:/satisfy/input/train.csv", "r", newline="") as trainFile:
    lines = np.array(list(csv.reader(trainFile))[1:])   # [1:] drops header row
    label = np.array(lines[:, -1]).astype(int)          # last column = TARGET
    trainData = np.array(lines[:, 1:-1]).astype(float)  # drop ID and TARGET
with open("D:/satisfy/input/test.csv", "r", newline="") as testFile:
    lines = np.array(list(csv.reader(testFile))[1:])
    testData = np.array(lines[:, 1:]).astype(float)     # drop ID column
with open("D:/satisfy/input/sample_submission.csv", "r", newline="") as benchmarkFile:
    lines = np.array(list(csv.reader(benchmarkFile))[1:])
    benchmarklabel = np.array(lines[:, 1]).astype(int)  # placeholder labels only
In [3]:
# Package the feature arrays (plus labels) into xgboost's DMatrix containers;
# the "test" labels are just the sample-submission placeholders.
train, test = (
    xgb.DMatrix(trainData, label=label),
    xgb.DMatrix(testData, label=benchmarklabel),
)
In [4]:
# Booster configuration: softmax multi-class objective over the two TARGET
# classes, so predict() returns hard 0/1 labels.
param = {
    'objective': 'multi:softmax',  # emit hard class labels
    'eta': 0.1,                    # learning rate
    'max_depth': 100,              # very deep trees — prone to overfitting
    'silent': 0,                   # verbose logging (deprecated in newer xgboost)
    'nthread': 8,                  # CPU threads
    'num_class': 2,                # binary target encoded as 2 classes
}
In [5]:
# Fit a 50-round booster, logging the eval metric on both matrices
# ("test" labels here are only the sample-submission placeholders).
num_round = 50
watchlist = [(train, 'train'), (test, 'test')]
bst = xgb.train(param, train, num_boost_round=num_round, evals=watchlist)
# hard 0/1 class labels, because the objective is multi:softmax
pred = bst.predict(test)
In [6]:
pred  # inspect the predicted labels (hard 0/1 values from multi:softmax)
Out[6]:
In [7]:
# Count of rows predicted as class 1 (labels are 0/1, so the sum is a count).
# Vectorized numpy reduction instead of Python's element-by-element sum().
pred.sum()
Out[7]:
In [8]:
# NOTE(review): writes a single unnamed column of hard 0/1 labels — no ID
# column and no header row; presumably the competition expects "ID,TARGET"
# (see sample_submission.csv) — confirm the required format before submitting.
np.savetxt("submission_xgboost_santander.csv", pred, delimiter = ",", fmt = "%f")
In [9]:
# Fix: xgboost's Booster has no classify() method — the original line raised
# AttributeError. predict() is the inference entry point on a trained Booster.
temp = bst.predict(test)
In [6]:
# Second configuration: multi:softprob returns per-class probabilities
# (usable for a ranking/AUC-style submission) instead of hard labels.
param = dict(
    objective='multi:softprob',  # per-class probability output
    eta=0.1,                     # learning rate
    max_depth=10000,             # effectively unbounded tree depth
    silent=0,                    # verbose logging (deprecated in newer xgboost)
    nthread=4,                   # CPU threads
    num_class=2,                 # binary target encoded as 2 classes
)
In [7]:
# Retrain with the softprob configuration for 100 rounds.
num_round = 100
watchlist = [(train, 'train'), (test, 'test')]
bst = xgb.train(param, train, num_boost_round=num_round, evals=watchlist)
# per-class probabilities, one column per class
pred = bst.predict(test)
In [8]:
pred  # inspect the per-class probabilities from multi:softprob
Out[8]:
In [9]:
# Column 1 = probability of class 1. NOTE(review): still no ID column or
# header row — confirm against sample_submission.csv before submitting.
np.savetxt("submission_xgboost_santander.csv", pred[:,1], delimiter = ",", fmt = "%f")
In [25]:
# Sklearn-style wrapper: 500 deep trees, small learning rate, row/column
# subsampling, and a fixed seed (7028) for reproducibility.
clf = xgb.XGBClassifier(missing=np.nan, max_depth=100, n_estimators=500, learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=7028)
In [26]:
# Fit on the full training set; no eval_set is supplied here, so there is no
# early stopping and eval_metric="auc" has no held-out data to score.
clf.fit(trainData, label, eval_metric="auc")
Out[26]:
In [10]:
# Fix: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [12]:
# Hold out 30% for early-stopping evaluation. Fix: the original passed no
# random_state, so the split changed on every kernel restart; pin it to the
# same seed (7028) used by the classifier for reproducibility.
X_fit, X_eval, y_fit, y_eval = train_test_split(trainData, label, test_size=0.3, random_state=7028)
In [16]:
# Fix: the original trained on the FULL training set while early-stopping on
# (X_eval, y_eval), which is a subset of that same data — the model is
# evaluated on rows it trained on, defeating early stopping. Train on the
# disjoint X_fit portion only.
clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])
Out[16]:
In [30]:
# Fix: roc_auc_score was imported in a LATER cell (out-of-order execution),
# so this fails on a fresh Restart & Run All; import locally so the cell is
# self-contained. NOTE: this is training-set AUC — optimistic vs. held-out.
from sklearn.metrics import roc_auc_score
print('Overall AUC:', roc_auc_score(label, clf.predict_proba(trainData)[:,1]))
In [18]:
from sklearn.metrics import roc_auc_score
In [31]:
# Score the test rows and write the probability of class 1 as TARGET.
# NOTE(review): output has only a TARGET column — confirm whether the
# submission format also requires an ID column.
probs_positive = clf.predict_proba(testData)[:, 1]
pd.DataFrame({"TARGET": probs_positive}).to_csv("submission_xgboost_84.csv", index=False)
In [ ]: