notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4









    



c:\users\administrator\anaconda3\envs\py3.5\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
c:\users\administrator\anaconda3\envs\py3.5\lib\site-packages\sklearn\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)



In [2]:

    
# Import the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [3]:

    
train.head()









    Out[3]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [4]:

    
train.dtypes









    Out[4]:





PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [5]:

    
for fea in [ 'Pclass', 'Sex', 'SibSp',
       'Parch', 'Embarked']:
    print("feature:%s" % fea)
    print(train[fea].value_counts(dropna=False))
    print()









    



feature:Pclass
3    491
1    216
2    184
Name: Pclass, dtype: int64

feature:Sex
male      577
female    314
Name: Sex, dtype: int64

feature:SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

feature:Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

feature:Embarked
S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64



In [6]:

    
# delete   'Name', 'Ticket', 'Cabin '
train.drop(['Name', 'Cabin', 'Ticket', 'Embarked'], axis=1, inplace=True)



In [7]:

    
train.head()









    Out[7]:







  
    
      
      PassengerId
      Survived
      Pclass
      Sex
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      0
      1
      0
      3
      male
      22.0
      1
      0
      7.2500
    
    
      1
      2
      1
      1
      female
      38.0
      1
      0
      71.2833
    
    
      2
      3
      1
      3
      female
      26.0
      0
      0
      7.9250
    
    
      3
      4
      1
      1
      female
      35.0
      1
      0
      53.1000
    
    
      4
      5
      0
      3
      male
      35.0
      0
      0
      8.0500



In [8]:

    
# train['Embarked'].fillna('S', inplace=True)



In [9]:

    
train = pd.get_dummies(train)
train.columns









    Out[9]:





Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male'],
      dtype='object')



In [10]:

    
train.head()









    Out[10]:







  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
      Sex_female
      Sex_male
    
  
  
    
      0
      1
      0
      3
      22.0
      1
      0
      7.2500
      0
      1
    
    
      1
      2
      1
      1
      38.0
      1
      0
      71.2833
      1
      0
    
    
      2
      3
      1
      3
      26.0
      0
      0
      7.9250
      1
      0
    
    
      3
      4
      1
      1
      35.0
      1
      0
      53.1000
      1
      0
    
    
      4
      5
      0
      3
      35.0
      0
      0
      8.0500
      0
      1



In [11]:

    
train.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Sex_female     891 non-null uint8
Sex_male       891 non-null uint8
dtypes: float64(2), int64(5), uint8(2)
memory usage: 50.5 KB



In [12]:

    
train['Age'].fillna(train['Age'].median(), inplace=True)



In [13]:

    
target = 'Survived'
IDcol = 'PassengerId'



In [14]:

    
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
        if useTrainCV:
            xgb_param = alg.get_xgb_params()
            xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                             metrics='auc', early_stopping_rounds=early_stopping_rounds)
            alg.set_params(n_estimators=cvresult.shape[0])

            alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')
        
        dtrain_predictions = alg.predict(dtrain[predictors])
        dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
        
        print ("\nModel Report")
        print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
        print ("Auc Score(Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
        
        feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')



In [15]:

    
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(learning_rate=0.1,
                   n_estimators=1000,
                   max_depth=7,
                   min_child_weight=4,
                   gamma=0,
                   subsample=0.8,
                   colsample_bytree=0.8,
                   objective='binary:logistic',
                   nthread=4,
                   scale_pos_weight=1,
                   seed=27)
modelfit(xgb1, train, predictors)









    



Model Report
Accuracy : 0.8889
Auc Score(Train): 0.948316



In [16]:

    
param_test1 = {
 'max_depth':list(range(3,10,1)),
 'min_child_weight':list(range(1,6,1))
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_









    Out[16]:





([mean: 0.86555, std: 0.03347, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.86407, std: 0.02849, params: {'min_child_weight': 2, 'max_depth': 3},
  mean: 0.86439, std: 0.02812, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.86400, std: 0.02872, params: {'min_child_weight': 4, 'max_depth': 3},
  mean: 0.86686, std: 0.02575, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: 0.86824, std: 0.03383, params: {'min_child_weight': 1, 'max_depth': 4},
  mean: 0.86759, std: 0.02803, params: {'min_child_weight': 2, 'max_depth': 4},
  mean: 0.86695, std: 0.02788, params: {'min_child_weight': 3, 'max_depth': 4},
  mean: 0.86846, std: 0.02980, params: {'min_child_weight': 4, 'max_depth': 4},
  mean: 0.86596, std: 0.02757, params: {'min_child_weight': 5, 'max_depth': 4},
  mean: 0.86597, std: 0.03286, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: 0.86797, std: 0.02626, params: {'min_child_weight': 2, 'max_depth': 5},
  mean: 0.86721, std: 0.03036, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.86756, std: 0.02867, params: {'min_child_weight': 4, 'max_depth': 5},
  mean: 0.86800, std: 0.02709, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.86585, std: 0.03051, params: {'min_child_weight': 1, 'max_depth': 6},
  mean: 0.86839, std: 0.02823, params: {'min_child_weight': 2, 'max_depth': 6},
  mean: 0.86943, std: 0.02744, params: {'min_child_weight': 3, 'max_depth': 6},
  mean: 0.86740, std: 0.03047, params: {'min_child_weight': 4, 'max_depth': 6},
  mean: 0.86725, std: 0.02913, params: {'min_child_weight': 5, 'max_depth': 6},
  mean: 0.86415, std: 0.03196, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: 0.86832, std: 0.02762, params: {'min_child_weight': 2, 'max_depth': 7},
  mean: 0.86872, std: 0.02737, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.86726, std: 0.02909, params: {'min_child_weight': 4, 'max_depth': 7},
  mean: 0.86755, std: 0.02759, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: 0.86357, std: 0.03358, params: {'min_child_weight': 1, 'max_depth': 8},
  mean: 0.86834, std: 0.02901, params: {'min_child_weight': 2, 'max_depth': 8},
  mean: 0.86934, std: 0.02949, params: {'min_child_weight': 3, 'max_depth': 8},
  mean: 0.86709, std: 0.02943, params: {'min_child_weight': 4, 'max_depth': 8},
  mean: 0.86886, std: 0.02865, params: {'min_child_weight': 5, 'max_depth': 8},
  mean: 0.86291, std: 0.03153, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: 0.86905, std: 0.02829, params: {'min_child_weight': 2, 'max_depth': 9},
  mean: 0.86870, std: 0.02914, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: 0.86808, std: 0.02986, params: {'min_child_weight': 4, 'max_depth': 9},
  mean: 0.86822, std: 0.02825, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 6, 'min_child_weight': 3},
 0.869430896520047)



In [17]:

    
param_test2 = {
 'gamma':[i/10.0 for i in list(range(0,5))]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=7,
 min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(train[predictors],train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_









    Out[17]:





([mean: 0.86726, std: 0.02909, params: {'gamma': 0.0},
  mean: 0.86797, std: 0.02913, params: {'gamma': 0.1},
  mean: 0.86799, std: 0.02965, params: {'gamma': 0.2},
  mean: 0.86775, std: 0.02838, params: {'gamma': 0.3},
  mean: 0.86712, std: 0.02915, params: {'gamma': 0.4}],
 {'gamma': 0.2},
 0.8679925655579707)



In [18]:

    
param_test3 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=7,
 min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_









    Out[18]:





([mean: 0.87179, std: 0.02841, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.86867, std: 0.02741, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.86661, std: 0.02908, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.86712, std: 0.02993, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.87179, std: 0.02841, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.86867, std: 0.02741, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.86661, std: 0.02908, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.86712, std: 0.02993, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.87001, std: 0.02592, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.87071, std: 0.02669, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.86726, std: 0.02909, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.86837, std: 0.03138, params: {'colsample_bytree': 0.8, 'subsample': 0.9},
  mean: 0.87237, std: 0.02902, params: {'colsample_bytree': 0.9, 'subsample': 0.6},
  mean: 0.87147, std: 0.02866, params: {'colsample_bytree': 0.9, 'subsample': 0.7},
  mean: 0.86719, std: 0.03114, params: {'colsample_bytree': 0.9, 'subsample': 0.8},
  mean: 0.86909, std: 0.02938, params: {'colsample_bytree': 0.9, 'subsample': 0.9}],
 {'colsample_bytree': 0.9, 'subsample': 0.6},
 0.8723699754768643)



In [19]:

    
param_test4 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=7,
 min_child_weight=4, gamma=0, subsample=0.6, colsample_bytree=0.9,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(train[predictors],train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_









    Out[19]:





([mean: 0.87237, std: 0.02902, params: {'reg_alpha': 0},
  mean: 0.87217, std: 0.02830, params: {'reg_alpha': 0.001},
  mean: 0.87074, std: 0.03026, params: {'reg_alpha': 0.005},
  mean: 0.86978, std: 0.02841, params: {'reg_alpha': 0.01},
  mean: 0.87067, std: 0.02740, params: {'reg_alpha': 0.05}],
 {'reg_alpha': 0},
 0.8723699754768643)



In [20]:

    
xgb2 = XGBClassifier(learning_rate=0.01,
                   n_estimators=1000,
                   max_depth=7,
                   min_child_weight=4,
                   gamma=0,
                   subsample=0.6,
                   colsample_bytree=0.9,
                   objective='binary:logistic',
                   nthread=4,
                   scale_pos_weight=1,
                   reg_alpha=0.005,
                   seed=27)
modelfit(xgb2, train, predictors)









    



Model Report
Accuracy : 0.8395
Auc Score(Train): 0.885848



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S