In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn import metrics
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
In [2]:
# Import the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
train.head()
Out[3]:
In [4]:
train.dtypes
Out[4]:
In [5]:
for fea in ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']:
    print("feature: %s" % fea)
    print(train[fea].value_counts(dropna=False))
    print()
In [6]:
# Drop columns that will not be used as features: 'Name', 'Cabin', 'Ticket', 'Embarked'
train.drop(['Name', 'Cabin', 'Ticket', 'Embarked'], axis=1, inplace=True)
In [7]:
train.head()
Out[7]:
In [8]:
# 'Embarked' was dropped above, so this fill is no longer needed:
# train['Embarked'].fillna('S', inplace=True)
In [9]:
train = pd.get_dummies(train)
train.columns
Out[9]:
In [10]:
train.head()
Out[10]:
In [11]:
train.info()
In [12]:
train['Age'].fillna(train['Age'].median(), inplace=True)
In [13]:
target = 'Survived'
IDcol = 'PassengerId'
In [14]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the training data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict class labels and probabilities on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # Plot feature importances
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [15]:
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=7,
                     min_child_weight=4,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
modelfit(xgb1, train, predictors)
In [16]:
param_test1 = {
    'max_depth': list(range(3, 10)),
    'min_child_weight': list(range(1, 6))
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(train[predictors], train[target])
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_
Out[16]:
In [17]:
param_test2 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=7,
                                                min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test2, scoring='roc_auc', n_jobs=4, cv=5)
gsearch2.fit(train[predictors], train[target])
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_
Out[17]:
In [18]:
param_test3 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=7,
                                                min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test3, scoring='roc_auc', n_jobs=4, cv=5)
gsearch3.fit(train[predictors], train[target])
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_
Out[18]:
In [19]:
param_test4 = {
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=7,
                                                min_child_weight=4, gamma=0, subsample=0.6, colsample_bytree=0.9,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test4, scoring='roc_auc', n_jobs=4, cv=5)
gsearch4.fit(train[predictors], train[target])
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_
Out[19]:
In [20]:
xgb2 = XGBClassifier(learning_rate=0.01,
                     n_estimators=1000,
                     max_depth=7,
                     min_child_weight=4,
                     gamma=0,
                     subsample=0.6,
                     colsample_bytree=0.9,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     reg_alpha=0.005,
                     seed=27)
modelfit(xgb2, train, predictors)
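The test set loaded at the start is never used above. As a rough sketch (not part of the original run), one could mirror the training preprocessing on test.csv and produce predictions with the tuned xgb2 model; the extra Fare imputation and the column alignment below are assumptions and should be checked against the actual test data.
In [21]:
# Sketch only: apply the same preprocessing as the training set, then predict with xgb2
test_proc = test.drop(['Name', 'Cabin', 'Ticket', 'Embarked'], axis=1)
test_proc = pd.get_dummies(test_proc)
test_proc['Age'].fillna(train['Age'].median(), inplace=True)
test_proc['Fare'].fillna(train['Fare'].median(), inplace=True)   # assumption: test.csv may contain missing Fare values
test_proc = test_proc.reindex(columns=[IDcol] + predictors, fill_value=0)  # align columns with the training features
test_pred = xgb2.predict(test_proc[predictors])
submission = pd.DataFrame({'PassengerId': test_proc[IDcol], 'Survived': test_pred})
submission.to_csv('submission.csv', index=False)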
In [ ]: