In [350]:
# Imports
import pandas as pd
import numpy as np
# machine learning
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
# xgboost
import xgboost as xgb
# matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [351]:
# Custom helper function
# Computes cross-validated precision, recall, and accuracy
def performance(clf, X_train, Y_train, cv_num=4):
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='precision')
    print("precision is {}".format(scores.mean()))
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='recall')
    print("recall is {}".format(scores.mean()))
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='accuracy')
    print("accuracy is {}".format(scores.mean()))
In [352]:
# get titanic & test csv files as a DataFrame
train = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/train.csv")
test = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/test.csv")
test_passengerId = test['PassengerId']
In [353]:
train.info()
print "--------------"
test.info()
In [354]:
#Combine into data:
train['source']= 'train'
test['source'] = 'test'
data=pd.concat([train, test],ignore_index=True)
data.shape
Out[354]:
In [355]:
# Take a quick look at what the data looks like
data.head()
Out[355]:
In [356]:
# Count missing values per column
data.apply(lambda x: sum(x.isnull()))
Out[356]:
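The same per-column null counts can be obtained more idiomatically with:
# data.isnull().sum()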
In [357]:
var = ['Sex','Ticket','Cabin','Embarked']
for v in var:
    print('\nFrequency count for variable %s' % v)
    print(data[v].value_counts())
In [358]:
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(111)
ax = data.boxplot(column='Fare', by=['Embarked', 'Pclass'], ax=ax)
# Both passengers with a missing Embarked paid a fare of 80 in Pclass 1;
# mark that level to see which port/class combination matches best
plt.axhline(y=80, color='green')
ax.set_title('', y=1.1)
data[data.Embarked.isnull()][['Fare', 'Pclass', 'Embarked']]
Out[358]:
In [359]:
# Fare 80 sits closest to the Pclass 1 / Embarked 'C' median, so fill the two missing values with 'C'
data['Embarked'].fillna('C', inplace=True)
In [360]:
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(111)
# The one passenger with a missing Fare is in Pclass 3 and embarked at 'S'
data[(data.Pclass==3) & (data.Embarked=='S')].Fare.hist(bins=100, ax=ax)
plt.xlabel('Fare')
plt.ylabel('Frequency')
plt.title('Histogram of Fare, Pclass 3 and Embarked S')
data[data.Fare.isnull()][['Pclass', 'Fare', 'Embarked']]
Out[360]:
In [361]:
print ("The top 5 most common value of Fare")
data[(data.Pclass==3)&(data.Embarked=='S')].Fare.value_counts().head()
Out[361]:
In [362]:
# 8.05 is the most common fare among Pclass 3 / Embarked 'S' passengers
data['Fare'].fillna(8.05, inplace=True)
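The hard-coded 8.05 is the modal fare in that subgroup; a sketch that derives the fill value instead of hard-coding it (same Pclass 3 / Embarked 'S' assumption):
# Alternative to the hard-coded constant: derive the modal fare directly
# fill_fare = data[(data.Pclass == 3) & (data.Embarked == 'S')]['Fare'].mode()[0]
# data['Fare'].fillna(fill_fare, inplace=True)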
In [363]:
# Flag rows with a missing Cabin, then fill them with the placeholder 'U0' (unknown deck, room 0)
data['Cabin_Missing'] = data['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Cabin'].fillna('U0', inplace=True)
In [364]:
import re
# Number of words in the passenger's name
data['Names'] = data['Name'].map(lambda x: len(re.split(' ', x)))
In [365]:
# Extract the title (the token between ', ' and '.') and merge the rare ones
title = data['Name'].map(lambda x: re.compile(r', (.*?)\.').findall(x)[0])
title[title == 'Mme'] = 'Mrs'
title[title.isin(['Ms', 'Mlle'])] = 'Miss'
title[title.isin(['Don', 'Jonkheer'])] = 'Sir'
title[title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'
title[title.isin(['Capt', 'Col', 'Major', 'Dr', 'Officer', 'Rev'])] = 'Officer'
data['Title'] = title
del title
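A quick sanity check that the rare titles were merged as intended:
# data['Title'].value_counts()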
In [366]:
# The leading letter(s) of the Cabin string identify the deck; filled-in cabins map to 'U'
deck = data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
data['Deck'] = deck
del deck
In [367]:
data.head()
Out[367]:
In [368]:
# Parse the room number out of the Cabin string; cabins without digits default to 1
checker = re.compile("([0-9]+)")
def roomNum(x):
    nums = checker.search(x)
    if nums:
        return int(nums.group()) + 1
    else:
        return 1
rooms = data['Cabin'].map(lambda x: roomNum(x))
In [369]:
# Normalize the room numbers by their overall sum
data['Cabin_Room'] = rooms / rooms.sum()
del checker, roomNum
In [370]:
data.head()
Out[370]:
In [371]:
# Group size = parents/children + siblings/spouses + the passenger themself
data['Group_num'] = data['Parch'] + data['SibSp'] + 1
In [372]:
# Bucket group size: 1 -> 'S', 2-4 -> 'M', 5+ -> 'L'
def groupSize(x):
    if x > 4:
        return 'L'
    elif x == 1:
        return 'S'
    else:
        return 'M'
group_size = data['Group_num'].map(lambda x: groupSize(x))
data['Group_size'] = group_size
In [373]:
data.head()
Out[373]:
In [374]:
data.dtypes
Out[374]:
In [375]:
from sklearn.preprocessing import StandardScaler
# Standardize Fare to zero mean and unit variance
scaler = StandardScaler()
data['Nor_Fare'] = pd.Series(scaler.fit_transform(data['Fare'].values.reshape(-1,1)).reshape(-1), index=data.index)
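Note that fitting the scaler on the combined frame uses test-set statistics, which is common in competitions but would leak in a production setting; a leak-free variant would fit on training rows only:
# Leak-free sketch: fit on training rows, transform everything
# scaler.fit(data.loc[data['source'] == 'train', 'Fare'].values.reshape(-1, 1))
# data['Nor_Fare'] = scaler.transform(data['Fare'].values.reshape(-1, 1)).reshape(-1)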
In [376]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Embarked', 'Sex', 'Deck', 'Group_size', 'Title']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
In [377]:
# One-hot encode the categorical columns
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Out[377]:
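Since pd.get_dummies one-hot encodes these columns anyway, the LabelEncoder pass above only renames the categories to integers (so the dummy columns come out as e.g. Sex_0 / Sex_1). An equivalent shortcut, sketched under the same column list, is to one-hot the raw string categories directly:
# data = pd.get_dummies(data, columns=['Embarked', 'Sex', 'Deck', 'Group_size', 'Title'])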
In [378]:
# Save the training labels before Survived is dropped from the combined frame below
label_y = data[data['source'] == 'train']['Survived']
In [379]:
from sklearn.model_selection import train_test_split
# Drop columns that won't be used as model features
data.drop(labels=['PassengerId', 'Name', 'Cabin', 'Survived', 'Ticket', 'Fare'], axis=1, inplace=True)
# Use the rows with a known Age to train a regressor that imputes the missing ages
X = data[data['Age'].notnull()].drop(['Age', 'source'], axis=1)
y = data[data['Age'].notnull()].Age
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def get_model(estimator, parameters, X_train, y_train, scoring):
    model = GridSearchCV(estimator, param_grid=parameters, scoring=scoring)
    model.fit(X_train, y_train)
    return model.best_estimator_

XGB = xgb.XGBRegressor(max_depth=4, seed=42)
scoring = make_scorer(mean_absolute_error, greater_is_better=False)
parameters = {'reg_alpha': np.linspace(0.1, 1.0, 5), 'reg_lambda': np.linspace(1.0, 3.0, 5)}
reg_xgb = get_model(XGB, parameters, X_train, y_train, scoring)
print(reg_xgb)
print("Mean absolute error of test data: {}".format(mean_absolute_error(y_test, reg_xgb.predict(X_test))))
In [380]:
fig = plt.figure(figsize=(15, 6))
alpha = 0.5
# Age distribution before imputation (known ages only)
data['Age'].dropna().plot(kind='density', color='#FA2379', label='Before', alpha=alpha)
# Predict the missing ages and fill them in
pred = reg_xgb.predict(data[data['Age'].isnull()].drop(['Age', 'source'], axis=1))
data.loc[data['Age'].isnull(), 'Age'] = pred
# Age distribution after imputation
data['Age'].plot(kind='density', label='After', alpha=alpha)
plt.xlabel('Age')
plt.title("What's the distribution of Age after predicting?")
plt.legend(loc='best')
plt.grid()
In [381]:
# Split the combined frame back into train and test using the 'source' flag
train = data.loc[data['source'] == 'train'].drop('source', axis=1)
test = data.loc[data['source'] == 'test'].drop('source', axis=1)
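A quick shape check; the standard Kaggle Titanic split has 891 training rows and 418 test rows:
# print(train.shape, test.shape, label_y.shape)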
In [382]:
def modelfit(alg, train, label_y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train, label=label_y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(train, label_y, eval_metric='auc')
    # Predict the training set
    dtrain_predictions = alg.predict(train)
    dtrain_predprob = alg.predict_proba(train)[:, 1]
    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(label_y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(label_y, dtrain_predprob))
    # Plot feature importances from the fitted booster
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [383]:
from xgboost.sklearn import XGBClassifier
from sklearn import metrics

# Baseline classifier; modelfit will shrink n_estimators to the CV-chosen round count
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    # scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, label_y)
In [384]:
# Grid search over max_depth and min_child_weight (coarse)
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(train, label_y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
Out[384]:
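Aside: grid_scores_ comes from older scikit-learn releases; from 0.20 on it is removed and the same information lives in cv_results_, e.g.:
# pd.DataFrame(gsearch1.cv_results_)[['params', 'mean_test_score', 'std_test_score']]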
In [385]:
# Refine max_depth and min_child_weight around the best values from the coarse search
param_test2 = {
    'max_depth': [8, 9, 10, 11, 12],
    'min_child_weight': [4, 5, 6]
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, seed=27),
                        param_grid=param_test2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2.fit(train, label_y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
Out[385]:
In [386]:
#Grid seach on subsample and max_features
#Choose all predictors except target & IDcols
param_test3 = {
'gamma':[i/10.0 for i in range(0,15)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=10,
min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4,seed=27),
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train,label_y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
Out[386]:
In [387]:
# Re-fit with the tuned max_depth, min_child_weight, and gamma
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=5,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    seed=27)
modelfit(xgb2, train, label_y)
In [388]:
# Grid search over subsample and colsample_bytree
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=177, max_depth=4,
                                                min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch4.fit(train, label_y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
Out[388]:
In [389]:
# Coarse grid search over the L1 regularization term reg_alpha
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=177, max_depth=4,
                                                min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch5.fit(train, label_y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
Out[389]:
In [390]:
# Fine grid search over reg_alpha around 0.1
param_test6 = {
    'reg_alpha': [0, 0.01, 0.05, 0.1, 0.16, 0.19]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=177, max_depth=4,
                                                min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test6, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch6.fit(train, label_y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_
Out[390]:
In [391]:
# Add the tuned reg_alpha and the resampled row/column fractions
xgb3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=5,
    gamma=0.9,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    objective='binary:logistic',
    nthread=4,
    seed=27)
modelfit(xgb3, train, label_y)
In [392]:
# Final model: cut the learning rate tenfold and allow proportionally more trees
xgb4 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=10,
    min_child_weight=5,
    gamma=0.9,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb4, train, label_y)
In [393]:
# Predict survival for the Kaggle test set
test_predict = xgb4.predict(test)
In [394]:
submission = pd.DataFrame({
    "PassengerId": test_passengerId,
    "Survived": test_predict
})
In [402]:
# Kaggle expects integer 0/1 labels in the Survived column
submission['Survived'] = submission['Survived'].astype('int64')
In [404]:
submission.to_csv('/Users/wy/Desktop/titanic_xgboost2.csv', index=False)