In [20]:
# TODO: read about feature selection / combination in pipelines
# (a minimal FeatureUnion sketch follows in the next cell)
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# http://scikit-learn.org/stable/auto_examples/feature_stacker.html
# http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
# https://github.com/paulgb/sklearn-pandas
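In [ ]:
# Minimal sketch of the FeatureUnion-of-pipelines pattern from the links above
# (illustrative only; the DataFrameMapper approach below takes a different route).
# A FeatureUnion runs several transformers side by side and concatenates their
# outputs column-wise; each branch can itself be a Pipeline.
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

union_demo = FeatureUnion([
    ('pca', PCA(n_components=2)),    # dense projection branch
    ('kbest', SelectKBest(k=1)),     # univariate selection branch
])
# plugged into a pipeline (classifier name is hypothetical):
# Pipeline([('features', union_demo), ('model', some_classifier)])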
In [66]:
# global settings
CORES = -1 # parallelization
RND = 123 # random seed
In [67]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
sns.set(style="white", palette="muted", color_codes=True)
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import pipeline
from sklearn import metrics
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import tree
from sklearn import cross_validation
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
In [35]:
def fetch(fname,
          drop=['PassengerId', 'Name', 'Ticket', 'Cabin'],
          add_derived=True):
    """
    add_derived : derive extra features from the raw columns: Title
    (e.g., Mr, Ms, Mrs) from the passenger name, Deck from the cabin,
    and TicketNum (whether the ticket is fully numeric) from the ticket.
    """
    drop = drop or []
    df = pd.read_csv(fname)
    # optional: extract title from name (token after the comma, trailing '.' stripped)
    if add_derived and 'Name' in df.columns:
        df['Title'] = (df['Name']
                       .apply(lambda x: x.split(',')[1].split()[0] if ',' in x else np.nan)
                       .apply(lambda x: x[:-1] if x.endswith('.') else x))
    if add_derived and 'Cabin' in df.columns:
        df['Deck'] = df['Cabin'].str[:1].fillna('X')
    if add_derived and 'Ticket' in df.columns:
        df['TicketNum'] = df.Ticket.str.isnumeric().astype(np.float64)
    drop = [col for col in drop if col in df.columns]
    df = df.drop(drop, axis=1)
    return df
In [36]:
# data
df = df_train = fetch('titanic/train.csv', drop=None)
df_comp = fetch('titanic/test.csv', drop=None)
df.drop('Name', axis=1).head(5)
Out[36]:
In [6]:
sns.pairplot(df_train, x_vars=['Age','Fare'], y_vars='Survived', size=7, aspect=0.7)
Out[6]:
In [7]:
sns.pairplot(df_train, x_vars=['Age'], y_vars='Fare', size=7, aspect=0.7)
Out[7]:
In [8]:
# Set up the matplotlib figure
f, axes = plt.subplots(2, figsize=(7, 7), sharex=False)
sns.despine(left=True)
sns.distplot(df_train['Fare'].dropna(), hist=True, rug=True, color="b", ax=axes[0])
sns.distplot(df_train['Age'].dropna(), hist=True, rug=True, color="r", ax=axes[1])
Out[8]:
In [9]:
# features / target: column 0 is PassengerId, column 1 is Survived
X, y = df_train.ix[:, 2:], df_train.Survived
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df_train.ix[:, 2:], df_train.Survived, test_size=0.4, random_state=RND
)
kf_total = cross_validation.KFold(len(X), n_folds=10, shuffle=True, random_state=RND)
X.drop('Name', axis=1).head(5)
Out[9]:
In [8]:
mapper = DataFrameMapper([
    ('Sex', preprocessing.LabelBinarizer()),
    ('Title', preprocessing.LabelBinarizer()),
    (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['SibSp'], preprocessing.Imputer(strategy='most_frequent')),
    (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
    (['Parch'], preprocessing.Imputer(strategy='most_frequent'))
])
# demo: featurized first row
mapper.fit_transform(df_train.copy())[0]
Out[8]:
In [10]:
# alternative feature mapper inspired by eyebervil's feature selection
# things to consider:
# - convert titles to Mr. / Mrs.
# - define FamilySize = SibSp + Parch + 1
# - exploit cabin information
# - impute age via regression (one way to structure this for DataFrameMapper
#   is sketched in the cell after this one)
# prepare a vectorized method that standardizes titles to one of: Mr, Mm, Ms
tmap = {}
tmap.update({k: 'Mr' for k in ("Mr", "Capt", "Master", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir")})
tmap.update({k: 'Mm' for k in ("Mrs", "Dona", "Lady", "Mme", "the Countess")})
tmap.update({k: 'Ms' for k in ("Mlle", "Ms", "Miss")})
# 'Dr' maps differently depending on sex
lup = dict(male=tmap.copy(), female=tmap.copy())
lup['male']['Dr'] = 'Mr'
lup['female']['Dr'] = 'Mm'
lup_default = 'Mm'

def standardize_title(F):
    title, sex = F
    return lup.get(sex, {}).get(title, lup_default)

def vectorized_standardize_title(F):
    return np.apply_along_axis(standardize_title, 1, F)

std_title = vectorized_standardize_title

def compute_family_size(F):
    # FamilySize = SibSp + Parch + 1
    return np.sum(F[:, 0:2], axis=1) + 1
mapper2 = DataFrameMapper([
    ('Sex', preprocessing.LabelBinarizer()),
    (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
    # title group
    (['Title', 'Sex'], [preprocessing.FunctionTransformer(std_title, validate=False),
                        preprocessing.LabelBinarizer()]),
    # family size
    (['SibSp', 'Parch'], preprocessing.FunctionTransformer(compute_family_size)),
    # cabin information: flag cabin presence
    #('Cabin', preprocessing.FunctionTransformer(lambda F: pd.notnull(F).astype(np.float64), validate=False)),
    # deck extracted from cabin
    ('Deck', preprocessing.LabelBinarizer())
])
# demo: featurized first row
mapper2.fit_transform(df_train.copy())[0:1]
Out[10]:
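In [ ]:
# A sketch of the regression-based Age imputation raised above: a custom
# transformer that fits a regressor on rows where the first column (Age) is
# known and predicts it where missing. The class name, the default regressor,
# and the predictor columns in the usage line are illustrative choices, and
# this is not wired into mapper2.
class RegressionImputer(BaseEstimator, TransformerMixin):
    """Impute column 0 of X by regressing it on the remaining columns."""
    def __init__(self, regressor=None):
        self.regressor = regressor or linear_model.LinearRegression()

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=np.float64)
        known = ~np.isnan(X[:, 0])
        self.regressor.fit(X[known, 1:], X[known, 0])
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=np.float64).copy()
        missing = np.isnan(X[:, 0])
        if missing.any():
            X[missing, 0] = self.regressor.predict(X[missing, 1:])
        return X[:, [0]]  # keep only the imputed Age column

# usage sketch, assuming the predictor columns are NaN-free:
# (['Age', 'Pclass', 'SibSp', 'Parch'], [RegressionImputer(), preprocessing.StandardScaler()])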
In [44]:
# mapper for tree-based models: label-encode categoricals instead of binarizing
mapper3 = DataFrameMapper([
    ('Sex', preprocessing.LabelEncoder()),
    ('Title', preprocessing.LabelEncoder()),
    (['Age'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['Fare'], [preprocessing.Imputer(), preprocessing.StandardScaler()]),
    (['SibSp'], preprocessing.Imputer(strategy='most_frequent')),
    (['Pclass'], preprocessing.Imputer(strategy='most_frequent')),
    (['Parch'], preprocessing.Imputer(strategy='most_frequent')),
    ('Deck', preprocessing.LabelEncoder()),  # Titanic deck as extracted from the Cabin
    ('TicketNum', None)  # 1.0 if the ticket is fully numeric, 0.0 otherwise
])
# demo: featurized first two rows
mapper3.fit_transform(df_train.copy())[0:2]
Out[44]:
In [11]:
model = dict()
In [12]:
cls = 'rf'
mod = model[cls] = dict()
mod['classifier'] = ensemble.RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, random_state=RND)
mod['params'] = dict(model__n_estimators=[100], model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [77]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
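In [ ]:
# A quick peek at what the fitted forest relies on (a sketch; assumes the 'rf'
# grid search above has been fitted). Importances follow the mapper's output
# column order, with each LabelBinarizer expanding to one column per level.
rf_best = model['rf']['model'].best_estimator_.named_steps['model']
print(rf_best.feature_importances_)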
In [16]:
cls='svm'
mod = model[cls] = dict()
mod['classifier'] = svm.SVC(random_state=RND)
mod['params'] = dict(model__kernel=['rbf', 'linear'], model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [322]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))
print("Accuracy (full sample): %0.3f" % mod['model'].score(df_train.copy(), df_train.Survived))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [332]:
cls='svm2'
mod = model[cls] = dict()
mod['classifier'] = svm.SVC(random_state=RND)
mod['params'] = dict(model__kernel=['rbf', 'linear', 'poly'],
                     model__C=[0.1, 1.0, 10.0],
                     model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper2), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [333]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))
print("Accuracy (full sample): %0.3f" % mod['model'].score(df_train.copy(), df_train.Survived))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [17]:
cls='svm3'
mod = model[cls] = dict()
mod['classifier'] = svm.SVC(random_state=RND)
mod['params'] = dict(model__kernel=['linear'],
                     model__C=[0.1, 1.0, 10.0],
                     model__random_state=[RND])
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [19]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [319]:
cls='adaboost'
mod = model[cls] = dict()
mod['classifier'] = ensemble.AdaBoostClassifier(random_state=RND)
mod['params'] = {'model__learning_rate': [0.3, 1.0, 1.3],
                 'model__base_estimator': [tree.DecisionTreeClassifier()]}
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [320]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
scores = cross_val_score(mod['model'], df_train.copy(), df_train.Survived, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Accuracy (full sample): %0.3f" % mod['model'].score(df_train.copy(), df_train.Survived))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
# OK this is clearly overfitted; one mitigation is sketched in the next cell
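In [ ]:
# One hedged way to rein in the overfitting noted above: limit the depth of the
# AdaBoost base trees. The depth and learning-rate values are illustrative, not
# tuned; fit / score as in the cells above.
cls = 'adaboost2'
mod = model[cls] = dict()
mod['classifier'] = ensemble.AdaBoostClassifier(random_state=RND)
mod['params'] = {'model__learning_rate': [0.3, 1.0],
                 'model__base_estimator': [tree.DecisionTreeClassifier(max_depth=d)
                                           for d in (1, 2, 3)]}
mod['pipe'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)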
In [292]:
cls = 'logit'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper), ('model', mod['classifier'])])
In [294]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [255]:
cls = 'logit2'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper2), ('model', mod['classifier'])])
In [277]:
mod = model['logit2']
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [272]:
cls = 'logpca'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper),
                                  ('pca', decomposition.PCA(n_components=10)),
                                  ('logit', mod['classifier'])])
In [278]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
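In [ ]:
# How much variance the 10 retained PCA components explain (a sketch; assumes
# the 'logpca' pipeline above has been fitted).
pca_step = model['logpca']['model'].named_steps['pca']
print(pca_step.explained_variance_ratio_)
print("total: %0.3f" % pca_step.explained_variance_ratio_.sum())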
In [268]:
cls = 'logpol'
mod = model[cls] = dict()
mod['classifier'] = linear_model.LogisticRegressionCV(Cs=20, cv=10, n_jobs=-1, random_state=RND)
mod['model'] = pipeline.Pipeline([('featurize', mapper),
                                  ('polynomial', preprocessing.PolynomialFeatures(degree=2,
                                                                                  interaction_only=True)),
                                  ('model', mod['classifier'])])
In [271]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
print("Accuracy: %0.3f (+/- %0.3f)" % (
    np.mean([max(sc) for sc in mod['classifier'].scores_[1]]),
    np.std([max(sc) for sc in mod['classifier'].scores_[1]])
))
print(metrics.confusion_matrix(df_train.Survived, mod['model'].predict(df_train.copy())))
mod['pred'].to_csv('titanic/submissions/titanic_{}.csv'.format(cls), header=True, index=False)
In [50]:
# NB: rebinds X_train / Y_train from the earlier DataFrame split to mapper3 matrices
Y_train = df_train.Survived.as_matrix()
X_train = mapper3.fit_transform(df_train.copy())
dtrain = xgb.DMatrix(X_train, label=Y_train)
In [ ]:
cls = 'xgb1'
mod = model[cls] = dict()
mod['classifier'] = xgb.XGBClassifier()
mod['params'] = {'model__max_depth': [2, 4, 6],
                 'model__n_estimators': [50, 100, 200]}
mod['pipe'] = pipeline.Pipeline([('featurize', mapper3), ('model', mod['classifier'])])
mod['model'] = GridSearchCV(mod['pipe'], mod['params'], cv=10, n_jobs=-1)
In [ ]:
mod['model'].fit(df_train.copy(), df_train.Survived)
mod['pred'] = pd.DataFrame({'PassengerId': df_comp.PassengerId,
                            'Survived': mod['model'].predict(df_comp.copy())})
In [ ]:
print(mod['model'].best_score_)
print(mod['model'].best_params_)
In [73]:
kf = cross_validation.KFold(Y_train.shape[0], n_folds=2, shuffle=True, random_state=RND)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X_train[train_index], Y_train[train_index])
    predictions = xgb_model.predict(X_train[test_index])
    actuals = Y_train[test_index]
    print(metrics.confusion_matrix(actuals, predictions))
In [71]:
predictions
Out[71]:
In [64]:
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic',
         'nthread': 4}
num_round = 2
print('running cross validation')
# do cross validation; this prints results as
# [iteration]  metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed=1)
Out[64]:
In [46]:
# fit on the mapper3 features prepared above
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=1000, learning_rate=0.05).fit(X_train, Y_train)
# transform (not re-fit) the competition frame with the mapper fitted on train;
# note LabelEncoder will choke on title/deck levels unseen during training
predictions = gbm.predict(mapper3.transform(df_comp.copy()))
metrics.accuracy_score(Y_train, gbm.predict(X_train))
In [280]:
# TODO: this was trained on the full sample, so the curve below is optimistic;
# a held-out variant is sketched in the next cell
if False:
    # https://github.com/justmarkham/DAT7/blob/master/notebooks/12_advanced_model_evaluation.ipynb
    # predict probability of survival (using the fitted logistic pipeline)
    y_pred_prob = model['logit']['model'].predict_proba(df_train)[:, 1]
    # plot ROC curve
    fpr, tpr, thresholds = metrics.roc_curve(df_train.Survived, y_pred_prob)
    plt.plot(fpr, tpr)
    plt.xlim([0.0, 0.3])
    plt.ylim([0.7, 1.0])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
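In [ ]:
# Held-out variant of the ROC curve above (a sketch): refit the logistic
# pipeline on a 60/40 split and score the ROC on the 40% it never saw.
# Using model['logit'] here is an illustrative choice; note this refits the
# stored pipeline in place.
idx_tr, idx_te = cross_validation.train_test_split(
    df_train.index.values, test_size=0.4, random_state=RND)
fitted = model['logit']['model'].fit(df_train.ix[idx_tr], df_train.Survived[idx_tr])
y_prob = fitted.predict_proba(df_train.ix[idx_te])[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(df_train.Survived[idx_te], y_prob)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
print("held-out AUC: %0.3f" % metrics.roc_auc_score(df_train.Survived[idx_te], y_prob))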