In [20]:
# TODO: read about feature selection / combination in pipelines
# (a minimal FeatureUnion sketch follows in the next cell)
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# http://scikit-learn.org/stable/auto_examples/feature_stacker.html
# http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
# https://github.com/paulgb/sklearn-pandas
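In [ ]:
# Minimal sketch of the FeatureUnion-of-pipelines pattern from the links above
# (illustrative only; the DataFrameMapper approach below takes a different route).
# A FeatureUnion runs several transformers side by side and concatenates their
# outputs column-wise; each branch can itself be a Pipeline.
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

union_demo = FeatureUnion([
    ('pca', PCA(n_components=2)),    # dense projection branch
    ('kbest', SelectKBest(k=1)),     # univariate selection branch
])
# plugged into a pipeline (classifier name is hypothetical):
# Pipeline([('features', union_demo), ('model', some_classifier)])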
In [66]:
# global settings
CORES = -1 # parallelization
RND = 123 # random seed
In [67]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
sns.set(style="white", palette="muted", color_codes=True)
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import pipeline
from sklearn import metrics
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import tree
from sklearn import cross_validation
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
In [35]:
def fetch(fname,
          drop=['PassengerId', 'Name', 'Ticket', 'Cabin'],
          add_derived=True):
    """
    add_derived : derive extra features from the raw columns: Title
    (e.g., Mr, Ms, Mrs) from the passenger name, Deck from the cabin,
    and TicketNum (whether the ticket is fully numeric) from the ticket.
    """
    drop = drop or []
    df = pd.read_csv(fname)
    # optional: extract title from name (token after the comma, trailing '.' stripped)
    if add_derived and 'Name' in df.columns:
        df['Title'] = (df['Name']
                       .apply(lambda x: x.split(',')[1].split()[0] if ',' in x else np.nan)
                       .apply(lambda x: x[:-1] if x.endswith('.') else x))
    if add_derived and 'Cabin' in df.columns:
        df['Deck'] = df['Cabin'].str[:1].fillna('X')
    if add_derived and 'Ticket' in df.columns:
        df['TicketNum'] = df.Ticket.str.isnumeric().astype(np.float64)
    drop = [col for col in drop if col in df.columns]
    df = df.drop(drop, axis=1)
    return df
In [36]:
# data
df = df_train = fetch('titanic/train.csv', drop=None)
df_comp = fetch('titanic/test.csv', drop=None)
df.drop('Name', axis=1).head(5)
Out[36]:
In [6]:
sns.pairplot(df_train, x_vars=['Age','Fare'], y_vars='Survived', size=7, aspect=0.7)
Out[6]:
In [7]:
sns.pairplot(df_train, x_vars=['Age'], y_vars='Fare', size=7, aspect=0.7)
Out[7]:
In [8]:
# Set up the matplotlib figure
f, axes = plt.subplots(2, figsize=(7, 7), sharex=False)
sns.despine(left=True)
sns.distplot(df_train['Fare'].dropna(), hist=True, rug=True, color="b", ax=axes[0])
sns.distplot(df_train['Age'].dropna(), hist=True, rug=True, color="r", ax=axes[1])
Out[8]:
In [9]:
# features / target: column 0 is PassengerId, column 1 is Survived
X, y = df_train.ix[:, 2:], df_train.Survived
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df_train.ix[:, 2:], df_train.Survived, test_size=0.4, random_state=RND
)
kf_total = cross_validation.KFold(len(X), n_folds=10, shuffle=True, random_state=RND)
X.drop('Name', axis=1).head(5)
Out[9]:
In [8]:
mapper = DataFrameMapper([
    ('Sex', preprocessing.LabelBinarizer()),
    ('Title', preprocessing.LabelBinarizer()),
    (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['SibSp'], preprocessing.Imputer(strategy='most_frequent')),
    (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
    (['Parch'], preprocessing.Imputer(strategy='most_frequent'))
])
# demo: featurized first row
mapper.fit_transform(df_train.copy())[0]
Out[8]:
In [10]:
# alternative feature mapper inspired by eyebervil's feature selection
# things to consider:
# - convert titles to Mr. / Mrs.
# - define FamilySize = SibSp + Parch + 1
# - exploit cabin information
# - impute age via regression (one way to structure this for DataFrameMapper
#   is sketched in the cell after this one)
# prepare a vectorized method that standardizes titles to one of: Mr, Mm, Ms
tmap = {}
tmap.update({k: 'Mr' for k in ("Mr", "Capt", "Master", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir")})
tmap.update({k: 'Mm' for k in ("Mrs", "Dona", "Lady", "Mme", "the Countess")})
tmap.update({k: 'Ms' for k in ("Mlle", "Ms", "Miss")})
# 'Dr' maps differently depending on sex
lup = dict(male=tmap.copy(), female=tmap.copy())
lup['male']['Dr'] = 'Mr'
lup['female']['Dr'] = 'Mm'
lup_default = 'Mm'

def standardize_title(F):
    title, sex = F
    return lup.get(sex, {}).get(title, lup_default)

def vectorized_standardize_title(F):
    return np.apply_along_axis(standardize_title, 1, F)

std_title = vectorized_standardize_title

def compute_family_size(F):
    # FamilySize = SibSp + Parch + 1
    return np.sum(F[:, 0:2], axis=1) + 1
mapper2 = DataFrameMapper([
    ('Sex', preprocessing.LabelBinarizer()),
    (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
    # title group
    (['Title', 'Sex'], [preprocessing.FunctionTransformer(std_title, validate=False),
                        preprocessing.LabelBinarizer()]),
    # family size
    (['SibSp', 'Parch'], preprocessing.FunctionTransformer(compute_family_size)),
    # cabin information: flag cabin presence
    #('Cabin', preprocessing.FunctionTransformer(lambda F: pd.notnull(F).astype(np.float64), validate=False)),
    # deck extracted from cabin
    ('Deck', preprocessing.LabelBinarizer())
])
# demo: featurized first row
mapper2.fit_transform(df_train.copy())[0:1]
Out[10]:
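In [ ]:
# A sketch of the regression-based Age imputation raised above: a custom
# transformer that fits a regressor on rows where the first column (Age) is
# known and predicts it where missing. The class name, the default regressor,
# and the predictor columns in the usage line are illustrative choices, and
# this is not wired into mapper2.
class RegressionImputer(BaseEstimator, TransformerMixin):
    """Impute column 0 of X by regressing it on the remaining columns."""
    def __init__(self, regressor=None):
        self.regressor = regressor or linear_model.LinearRegression()

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=np.float64)
        known = ~np.isnan(X[:, 0])
        self.regressor.fit(X[known, 1:], X[known, 0])
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=np.float64).copy()
        missing = np.isnan(X[:, 0])
        if missing.any():
            X[missing, 0] = self.regressor.predict(X[missing, 1:])
        return X[:, [0]]  # keep only the imputed Age column

# usage sketch, assuming the predictor columns are NaN-free:
# (['Age', 'Pclass', 'SibSp', 'Parch'], [RegressionImputer(), preprocessing.StandardScaler()])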
In [44]:
# mapper for tree-based models: label-encode categoricals instead of binarizing
mapper3 = DataFrameMapper([
    ('Sex', preprocessing.LabelEncoder()),
    ('Title', preprocessing.LabelEncoder()),
    (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['SibSp'], preprocessing.Imputer(strategy='most_frequent')),
    (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
    (['Parch'], preprocessing.Imputer(strategy='most_frequent')),
    ('Deck', preprocessing.LabelEncoder()),  # Titanic deck as extracted from the Cabin
    ('TicketNum', None)  # 1.0 if the ticket is fully numeric, 0.0 otherwise
])
# demo: featurized first two rows
mapper3.fit_transform(df_train.copy())[0:2]
Out[44]:
In [11]:
model = dict()
In [12]:
cls = 'rf'
mod = model[cls] = dict()
mod['classifier'] = ensemble.RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, random_state=RND)
mod['params'] = dict(model__n_estimators=[100], model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [77]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
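In [ ]:
# A quick peek at what the fitted forest relies on (a sketch; assumes the 'rf'
# grid search above has been fitted). Importances follow the mapper's output
# column order, with each LabelBinarizer expanding to one column per level.
rf_best = model['rf']['model'].best_estimator_.named_steps['model']
print(rf_best.feature_importances_)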
In [16]:
cls='svm'
mod = model[cls] = dict()
mod['classifier'] = svm.SVC(random_state=RND)
mod['params'] = dict(model__kernel=['rbf', 'linear'], model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [322]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))
print("Accuracy (full sample): %0.3f" % mod['model'].score(df_train.copy(), df_train.Survived))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [332]:
cls='svm2'
mod = model[cls] = dict()
mod['classifier'] = svm.SVC(random_state=RND)
mod['params'] = dict(model__kernel=['rbf', 'linear', 'poly'],
                     model__C=[0.1, 1.0, 10.0],
                     model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper2), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [333]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))
print("Accuracy (full sample): %0.3f" % mod['model'].score(df_train.copy(), df_train.Survived))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [17]:
cls='svm3'
mod = model[cls] = dict()
mod['classifier'] = svm.SVC(random_state=RND)
mod['params'] = dict(model__kernel=['linear'],
                     model__C=[0.1, 1.0, 10.0],
                     model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [19]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [319]:
cls='adaboost'
mod = model[cls] = dict()
mod['classifier'] = ensemble.AdaBoostClassifier(random_state=RND)
mod['params'] = {'model__learning_rate': [0.3, 1.0, 1.3],
                 'model__base_estimator': [tree.DecisionTreeClassifier()]}
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [320]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Accuracy (full sample): %0.3f" % mod['model'].score(df_train.copy(), df_train.Survived))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
# OK this is clearly overfitted; one mitigation is sketched in the next cell
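In [ ]:
# One hedged way to rein in the overfitting noted above: limit the depth of the
# AdaBoost base trees. The depth and learning-rate values are illustrative, not
# tuned; fit / score as in the cells above.
cls = 'adaboost2'
mod = model[cls] = dict()
mod['classifier'] = ensemble.AdaBoostClassifier(random_state=RND)
mod['params'] = {'model__learning_rate': [0.3, 1.0],
                 'model__base_estimator': [tree.DecisionTreeClassifier(max_depth=d)
                                           for d in (1, 2, 3)]}
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)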
In [292]:
cls = 'logit'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
In [294]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [255]:
cls = 'logit2'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper2), ('model', mod['classifier'])])
In [277]:
mod = model['logit2']
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [272]:
cls = 'logpca'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper),
                                  ('pca', decomposition.PCA(n_components=10)),
                                  ('logit', mod['classifier'])])
In [278]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
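In [ ]:
# How much variance the 10 retained PCA components explain (a sketch; assumes
# the 'logpca' pipeline above has been fitted).
pca_step = model['logpca']['model'].named_steps['pca']
print(pca_step.explained_variance_ratio_)
print("total: %0.3f" % pca_step.explained_variance_ratio_.sum())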
In [268]:
cls = 'logpol'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper),
                                  ('polynomial', preprocessing.PolynomialFeatures(degree=2,
                                                                                  interaction_only=True)),
                                  ('model', mod['classifier'])])
In [271]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [50]:
# NB: rebinds X_train / Y_train from the earlier DataFrame split to mapper3 matrices
Y_train = df_train.Survived.as_matrix()
X_train = mapper3.fit_transform(df_train.copy())
dtrain = xgb.DMatrix(X_train, label=Y_train)
In [ ]:
cls = 'xgb1'
mod = model[cls] = dict()
mod['classifier'] = xgb.XGBClassifier()
mod['params'] = {'model__max_depth': [2, 4, 6],
                 'model__n_estimators': [50, 100, 200]}
mod['pipe'] = pipeline.Pipeline([('featurize', mapper3), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [ ]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
In [ ]:
print(mod['model'].best_score_)
print(mod['model'].best_params_)
In [73]:
kf = cross_validation.KFold(Y_train.shape[0], n_folds=2, shuffle=True, random_state=RND)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X_train[train_index], Y_train[train_index])
    predictions = xgb_model.predict(X_train[test_index])
    actuals = Y_train[test_index]
    print(metrics.confusion_matrix(actuals, predictions))
In [71]:
predictions
Out[71]:
In [64]:
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic',
         'nthread': 4}
num_round = 2
print('running cross validation')
# do cross validation; this prints results as
# [iteration]  metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed=1)
Out[64]:
In [46]:
# fit on the mapper3 features prepared above
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=1000, learning_rate=0.05).fit(X_train, Y_train)
# transform (not re-fit) the competition frame with the mapper fitted on train;
# note LabelEncoder will choke on title/deck levels unseen during training
predictions = gbm.predict(mapper3.transform(df_comp.copy()))
metrics.accuracy_score(Y_train, gbm.predict(X_train))
In [280]:
# TODO: this was trained on the full sample, so the curve below is optimistic;
# a held-out variant is sketched in the next cell
if False:
    # https://github.com/justmarkham/DAT7/blob/master/notebooks/12_advanced_model_evaluation.ipynb
    # predict probability of survival (using the fitted logistic pipeline)
    y_pred_prob = model['logit']['model'].predict_proba(df_train)[:, 1]
    # plot ROC curve
    fpr, tpr, thresholds = metrics.roc_curve(df_train.Survived, y_pred_prob)
    plt.plot(fpr, tpr)
    plt.xlim([0.0, 0.3])
    plt.ylim([0.7, 1.0])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
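In [ ]:
# Held-out variant of the ROC curve above (a sketch): refit the logistic
# pipeline on a 60/40 split and score the ROC on the 40% it never saw.
# Using model['logit'] here is an illustrative choice; note this refits the
# stored pipeline in place.
idx_tr, idx_te = cross_validation.train_test_split(
    df_train.index.values, test_size=0.4, random_state=RND)
fitted = model['logit']['model'].fit(df_train.ix[idx_tr], df_train.Survived[idx_tr])
y_prob = fitted.predict_proba(df_train.ix[idx_te])[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(df_train.Survived[idx_te], y_prob)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
print("held-out AUC: %0.3f" % metrics.roc_auc_score(df_train.Survived[idx_te], y_prob))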