In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
sns.set(style="white", palette="muted", color_codes=True)
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import pipeline
from sklearn import metrics
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_predict
# cross_val_score comes from sklearn_pandas so it can operate on DataFrames directly
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV



In [2]:
def fetch(fname,
          drop=['PassengerId', 'Name', 'Ticket', 'Cabin'],
          add_derived=True):
    """
    add_title   : extract title (e.g., Mr, Ms, Mrs, etc) from passanger name
    """
    drop = drop or []
    df = pd.read_csv(fname)
    # optional: extract title from name
    if add_derived and 'Name' in df.columns:
        df['Title'] = (df['Name']
                       .apply(lambda x: x.split(',')[1].split()[0] if ',' in x else np.Nan)
                       .apply(lambda x: x[:-1] if x.endswith('.') else x))
    if add_derived and 'Cabin' in df.columns:
        df['Deck'] = df['Cabin'].str[:1].fillna('X')
    drop = [col for col in drop if col in df.columns]
    df = df.drop(drop, axis=1)
    return df
# data
df = df_train = fetch('titanic/train.csv', drop=None)
df_comp  = fetch('titanic/test.csv', drop=None)
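
In [ ]:
# Hypothetical sanity check (not part of the original run): eyeball the
# Title and Deck columns that fetch() derives from Name and Cabin.
df_train[['Name', 'Title', 'Deck']].head()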

In [13]:
mapper = DataFrameMapper([
     ('Sex', preprocessing.LabelBinarizer()),
     ('Title', preprocessing.LabelBinarizer()),
     (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
     (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
     (['SibSp'], preprocessing.Imputer(strategy='most_frequent')),
     (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
     (['Parch'], preprocessing.Imputer(strategy='most_frequent'))
    ])
# demo: build the target vector and feature matrix
Ys = df_train.Survived.values
Xs = mapper.fit_transform(df_train.copy())

In [4]:
import xgboost as xgb

In [39]:
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=1000, learning_rate=0.05).fit(Xs, Ys)
# use transform (not fit_transform): the test set must be encoded with the
# mapping fitted on the training data, or the columns will not line up
# (note: titles unseen in training, e.g. Dona, may still need special handling)
predictions = gbm.predict(mapper.transform(df_comp.copy()))
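
In [ ]:
# Hypothetical follow-up (not in the original run): write a Kaggle-style
# submission file. Assumes df_comp still carries PassengerId (it was fetched
# with drop=None) and that 'titanic/submission.csv' is an acceptable path.
submission = pd.DataFrame({'PassengerId': df_comp['PassengerId'],
                           'Survived': predictions.astype(int)})
submission.to_csv('titanic/submission.csv', index=False)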

In [40]:
# accuracy on the training data (optimistic; see the held-out check below)
metrics.accuracy_score(Ys, gbm.predict(Xs))


Out[40]:
0.9494949494949495
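
In [ ]:
# The score above is measured on the training data, so it is optimistic.
# A minimal held-out check (hypothetical cell), reusing the train_test_split
# imported earlier; the exact number will vary with the random seed.
X_tr, X_te, y_tr, y_te = train_test_split(Xs, Ys, test_size=0.2, random_state=1)
gbm_ho = xgb.XGBClassifier(max_depth=4, n_estimators=1000, learning_rate=0.05).fit(X_tr, y_tr)
metrics.accuracy_score(y_te, gbm_ho.predict(X_te))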

In [44]:
df_comp.copy()


Out[44]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title Deck
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q Mr X
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S Mrs X
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q Mr X
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S Mr X
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S Mrs X
5 897 3 Svensson, Mr. Johan Cervin male 14.0 0 0 7538 9.2250 NaN S Mr X
6 898 3 Connolly, Miss. Kate female 30.0 0 0 330972 7.6292 NaN Q Miss X
7 899 2 Caldwell, Mr. Albert Francis male 26.0 1 1 248738 29.0000 NaN S Mr X
8 900 3 Abrahim, Mrs. Joseph (Sophie Halaut Easu) female 18.0 0 0 2657 7.2292 NaN C Mrs X
9 901 3 Davies, Mr. John Samuel male 21.0 2 0 A/4 48871 24.1500 NaN S Mr X
10 902 3 Ilieff, Mr. Ylio male NaN 0 0 349220 7.8958 NaN S Mr X
11 903 1 Jones, Mr. Charles Cresson male 46.0 0 0 694 26.0000 NaN S Mr X
12 904 1 Snyder, Mrs. John Pillsbury (Nelle Stevenson) female 23.0 1 0 21228 82.2667 B45 S Mrs B
13 905 2 Howard, Mr. Benjamin male 63.0 1 0 24065 26.0000 NaN S Mr X
14 906 1 Chaffee, Mrs. Herbert Fuller (Carrie Constance... female 47.0 1 0 W.E.P. 5734 61.1750 E31 S Mrs E
15 907 2 del Carlo, Mrs. Sebastiano (Argenia Genovesi) female 24.0 1 0 SC/PARIS 2167 27.7208 NaN C Mrs X
16 908 2 Keane, Mr. Daniel male 35.0 0 0 233734 12.3500 NaN Q Mr X
17 909 3 Assaf, Mr. Gerios male 21.0 0 0 2692 7.2250 NaN C Mr X
18 910 3 Ilmakangas, Miss. Ida Livija female 27.0 1 0 STON/O2. 3101270 7.9250 NaN S Miss X
19 911 3 Assaf Khalil, Mrs. Mariana ("Miriam") female 45.0 0 0 2696 7.2250 NaN C Mrs X
20 912 1 Rothschild, Mr. Martin male 55.0 1 0 PC 17603 59.4000 NaN C Mr X
21 913 3 Olsen, Master. Artur Karl male 9.0 0 1 C 17368 3.1708 NaN S Master X
22 914 1 Flegenheim, Mrs. Alfred (Antoinette) female NaN 0 0 PC 17598 31.6833 NaN S Mrs X
23 915 1 Williams, Mr. Richard Norris II male 21.0 0 1 PC 17597 61.3792 NaN C Mr X
24 916 1 Ryerson, Mrs. Arthur Larned (Emily Maria Borie) female 48.0 1 3 PC 17608 262.3750 B57 B59 B63 B66 C Mrs B
25 917 3 Robins, Mr. Alexander A male 50.0 1 0 A/5. 3337 14.5000 NaN S Mr X
26 918 1 Ostby, Miss. Helene Ragnhild female 22.0 0 1 113509 61.9792 B36 C Miss B
27 919 3 Daher, Mr. Shedid male 22.5 0 0 2698 7.2250 NaN C Mr X
28 920 1 Brady, Mr. John Bertram male 41.0 0 0 113054 30.5000 A21 S Mr A
29 921 3 Samaan, Mr. Elias male NaN 2 0 2662 21.6792 NaN C Mr X
... ... ... ... ... ... ... ... ... ... ... ... ... ...
388 1280 3 Canavan, Mr. Patrick male 21.0 0 0 364858 7.7500 NaN Q Mr X
389 1281 3 Palsson, Master. Paul Folke male 6.0 3 1 349909 21.0750 NaN S Master X
390 1282 1 Payne, Mr. Vivian Ponsonby male 23.0 0 0 12749 93.5000 B24 S Mr B
391 1283 1 Lines, Mrs. Ernest H (Elizabeth Lindsey James) female 51.0 0 1 PC 17592 39.4000 D28 S Mrs D
392 1284 3 Abbott, Master. Eugene Joseph male 13.0 0 2 C.A. 2673 20.2500 NaN S Master X
393 1285 2 Gilbert, Mr. William male 47.0 0 0 C.A. 30769 10.5000 NaN S Mr X
394 1286 3 Kink-Heilmann, Mr. Anton male 29.0 3 1 315153 22.0250 NaN S Mr X
395 1287 1 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) female 18.0 1 0 13695 60.0000 C31 S Mrs C
396 1288 3 Colbert, Mr. Patrick male 24.0 0 0 371109 7.2500 NaN Q Mr X
397 1289 1 Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... female 48.0 1 1 13567 79.2000 B41 C Mrs B
398 1290 3 Larsson-Rondberg, Mr. Edvard A male 22.0 0 0 347065 7.7750 NaN S Mr X
399 1291 3 Conlon, Mr. Thomas Henry male 31.0 0 0 21332 7.7333 NaN Q Mr X
400 1292 1 Bonnell, Miss. Caroline female 30.0 0 0 36928 164.8667 C7 S Miss C
401 1293 2 Gale, Mr. Harry male 38.0 1 0 28664 21.0000 NaN S Mr X
402 1294 1 Gibson, Miss. Dorothy Winifred female 22.0 0 1 112378 59.4000 NaN C Miss X
403 1295 1 Carrau, Mr. Jose Pedro male 17.0 0 0 113059 47.1000 NaN S Mr X
404 1296 1 Frauenthal, Mr. Isaac Gerald male 43.0 1 0 17765 27.7208 D40 C Mr D
405 1297 2 Nourney, Mr. Alfred ("Baron von Drachstedt") male 20.0 0 0 SC/PARIS 2166 13.8625 D38 C Mr D
406 1298 2 Ware, Mr. William Jeffery male 23.0 1 0 28666 10.5000 NaN S Mr X
407 1299 1 Widener, Mr. George Dunton male 50.0 1 1 113503 211.5000 C80 C Mr C
408 1300 3 Riordan, Miss. Johanna "Hannah" female NaN 0 0 334915 7.7208 NaN Q Miss X
409 1301 3 Peacock, Miss. Treasteall female 3.0 1 1 SOTON/O.Q. 3101315 13.7750 NaN S Miss X
410 1302 3 Naughton, Miss. Hannah female NaN 0 0 365237 7.7500 NaN Q Miss X
411 1303 1 Minahan, Mrs. William Edward (Lillian E Thorpe) female 37.0 1 0 19928 90.0000 C78 Q Mrs C
412 1304 3 Henriksson, Miss. Jenny Lovisa female 28.0 0 0 347086 7.7750 NaN S Miss X
413 1305 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S Mr X
414 1306 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C Dona C
415 1307 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S Mr X
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S Mr X
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C Master X

418 rows × 13 columns


In [35]:
dtrain = xgb.DMatrix(Xs, Ys)
# note: 'n_estimators' and 'learning_rate' are sklearn-wrapper names that the
# native xgb.cv/xgb.train API ignores; 'eta' is the native learning rate, so
# eta=1 (i.e. no shrinkage) is what actually takes effect below, and the
# number of boosting rounds is passed separately as num_round.
param = {'max_depth': 3, 'n_estimators': 300, 'learning_rate': 0.05, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# A watchlist does not affect model training; it simply reports prediction
# error on an independent sample (one not used for training) as training runs.
# The categorical variables in this dataset have already been binarized; if
# that were not the case, one-hot encoding would have to be applied to all
# categoricals first (see Orchestra.jl).

# hyperparameter recipe via http://motls.blogspot.in/2014/09/a-top-2-kaggle-higgs-solution.html
# eta = 0.01         (small shrinkage)
# max_depth = 9      (the default max_depth=6 is not deep enough)
# subsample = 0.9    (some randomness to prevent overfitting)
# num_rounds = 3000  (slow shrinkage needs many trees)
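
In [ ]:
# Hypothetical sketch of a watchlist: `evals` reports the metric on each named
# DMatrix after every boosting round without affecting the fit itself. The
# parameters follow the blog recipe quoted above; the held-out split below is
# an assumption, not something built in the original notebook.
X_fit, X_val, y_fit, y_val = train_test_split(Xs, Ys, test_size=0.2, random_state=1)
dfit, dval = xgb.DMatrix(X_fit, y_fit), xgb.DMatrix(X_val, y_val)
blog_param = {'max_depth': 9, 'eta': 0.01, 'subsample': 0.9,
              'silent': 1, 'objective': 'binary:logistic'}
bst = xgb.train(blog_param, dfit, num_boost_round=300,  # recipe says 3000; kept short here
                evals=[(dfit, 'train'), (dval, 'valid')])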

In [37]:
print ('running cross validation')
num_round = 20
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 1)


running cross validation
Out[37]:
test-error-mean test-error-std train-error-mean train-error-std
0 0.183146 0.026731 0.167416 0.005210
1 0.192135 0.028068 0.152809 0.007238
2 0.188764 0.035102 0.143258 0.006090
3 0.182022 0.033820 0.138764 0.006846
4 0.182022 0.032096 0.126404 0.004070
5 0.178651 0.022862 0.125843 0.006318
6 0.176404 0.024770 0.125562 0.005590
7 0.179775 0.023569 0.121629 0.005998
8 0.177528 0.025769 0.117135 0.007464
9 0.170786 0.019974 0.115450 0.005576
10 0.170786 0.019002 0.110112 0.005865
11 0.167415 0.017551 0.107023 0.007346
12 0.174157 0.026351 0.105056 0.005852
13 0.177528 0.022359 0.105337 0.005825
14 0.176404 0.024770 0.100843 0.005134
15 0.175281 0.024719 0.098315 0.008092
16 0.178651 0.030853 0.095225 0.004898
17 0.176404 0.024514 0.091292 0.005547
18 0.176404 0.026494 0.089045 0.004914
19 0.169663 0.028291 0.088202 0.004109
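
In [ ]:
# Hypothetical follow-up: re-run the cross validation with the result assigned,
# then pick the round with the lowest mean test error (round 11, 0.167415, in
# the run above).
cv_res = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=1)
best_round = cv_res['test-error-mean'].idxmin()
cv_res.loc[best_round]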

In [ ]: