In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Note: sklearn.grid_search and sklearn.cross_validation were merged into
# sklearn.model_selection in scikit-learn 0.18; this notebook uses the old API.
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
%matplotlib inline

In [ ]:
train = pd.read_csv("../data/train.csv")
test  = pd.read_csv("../data/test.csv")

In [13]:
# train = pd.read_csv('s3a://aws-s3-data/kaggle/titanic/train.csv')
# test  = pd.read_csv('s3a://aws-s3-data/kaggle/titanic/test.csv')

Exploratory Data Analysis


In [14]:
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB
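
Before filling anything in, a quick missing-value census over both files helps; a small sketch (standard Kaggle Titanic columns assumed):

In [ ]:
# Missing values per column in train and test side by side (a sketch;
# test has no Survived column, so that row shows NaN for it).
pd.concat([train.isnull().sum(), test.isnull().sum()], axis=1, keys=['train', 'test'])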

In [15]:
train.describe()


Out[15]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [ ]:
# train.Cabin.str.split().str.get(-1).str[0]
# train.Cabin.str.split(expand=True)

In [16]:
# train.Ticket.str.split().str.get(0).str.extract
train.Ticket.str.split().str[0].head()


Out[16]:
0         A/5
1          PC
2    STON/O2.
3      113803
4      373450
dtype: object
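
The leading token of Ticket mixes prefixes and bare numbers; a sketch counting the most common ones:

In [ ]:
# Frequency of the first whitespace-separated Ticket token (a sketch).
train.Ticket.str.split().str[0].value_counts().head(10)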

In [20]:
print train[train['Survived']==1]["Age"].mean(),
print train[train['Survived']==0]["Age"].mean(),
print test.Age.mean()


28.3436896552 30.6261792453 30.2725903614
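
Survival rate by sex and class gives a quick baseline picture before any modeling; a sketch with a plain groupby:

In [ ]:
# Mean survival rate per (Sex, Pclass) cell (a sketch).
train.groupby(['Sex', 'Pclass'])['Survived'].mean().unstack()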

Data Cleaning


In [21]:
def clean_data(titanic):

    titanic = titanic.copy()
    
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    titanic['Cabin'] = titanic['Cabin'].str.split().str.get(-1).str[0]
    titanic['Ticket'] = titanic.Ticket.str.split().str[0]
    
    # Encode Sex as a signed magnitude so the interaction features
    # built below separate the sexes by sign.
    titanic.loc[titanic["Sex"] == "male", "Sex"] = -10
    titanic.loc[titanic["Sex"] == "female", "Sex"] = 10
    
    titanic["Embarked"] = titanic["Embarked"].fillna("S")
    
    titanic['Title'] = titanic['Name'].apply(lambda x: x.split(',')[1].split()[0])

#     d = {'Mr.':'Mr', 'Mrs.':'Mrs', 'Miss.':'Miss', 'Master.':'Master', 'Don.':'Mr', 'Rev.':'Mr', 'Dr.':'Dr', 'Mme.':'Mrs',
#        'Ms.':'Miss', 'Major.':'Mr', 'Lady.':'Miss', 'Sir.':'Mr', 'Mlle.':'Miss', 'Col.':'Mr', 'Capt.':'Mr', 'the':'Mr',
#        'Jonkheer.':'Mr', 'Dona.':'Mrs'}
    
    # Hand-picked numeric codes for each title (an ad-hoc ordinal encoding).
    d = {'Mr.':28, 'Mrs.':80, 'Miss.':50, 'Master.':28, 'Don.':40, 'Rev.':60, 'Dr.':60, 'Mme.':80,
         'Ms.':50, 'Major.':60, 'Lady.':70, 'Sir.':40, 'Mlle.':50, 'Col.':60, 'Capt.':60, 'the':28,
         'Jonkheer.':28, 'Dona.':70}

    titanic['Title'].replace(d, inplace=True)
    
    colnames = ['Embarked','Cabin','Ticket']
    for colname in colnames:
        titanic[colname] = pd.Categorical(titanic[colname]).codes

#     # Grab all the features that can be included in a Random Forest Regressor
#     age_titanic = titanic[['Age','Fare','Ticket','Pclass','Cabin','Title']]

#     # Split into sets with known and unknown Age values
#     knownAge = age_titanic.loc[ (titanic.Age.notnull()) ]
#     unknownAge = age_titanic.loc[ (titanic.Age.isnull()) ]
    
#     # All age values are stored in a target array
#     y = knownAge.pop('Age').values
  
#     # All the other values are stored in the feature array
#     X = knownAge.values
    
#     # Create and fit a model
#     rtr = RandomForestRegressor(20)
#     rtr.fit(X, y)
    
#     # Use the fitted model to predict the missing values
#     predictedAges = rtr.predict(unknownAge.values[:, 1::])
    
#     # Assign those predictions to the full data set
#     titanic.loc[ (titanic.Age.isnull()), 'Age' ] = predictedAges 
    
    
    # StandardScaler will subtract the mean from each value then scale to the unit variance
#     scaler = StandardScaler()
#     titanic['Age_scaled'] = scaler.fit_transform(titanic['Age'])
#     titanic['Fare_scaled'] = scaler.fit_transform(titanic['Fare'])
    
    # Scale Age and Fare into [0, 1] by the column max (note that train and
    # test are each scaled by their own maxima here).
    titanic.Age = titanic.Age/titanic.Age.max()
    titanic.Fare = titanic.Fare/titanic.Fare.max()

    titanic['AgeSex'] = titanic.Age * titanic.Sex
    titanic['AgeSexFare'] = titanic.Age * titanic.Sex * titanic.Fare
#     titanic['TitlePclass'] = titanic.Title * titanic.Pclass
#     titanic['CabinPclass'] = titanic.Cabin * titanic.Pclass
#     titanic['PclassSq'] = titanic.Pclass ** 2
#     titanic['SexFare'] = titanic.Sex * titanic.Fare
#     titanic["FamilySize"] = titanic['Parch'] + titanic['SibSp']


#     titanic.loc[(titanic["Sex"] == "female") , "Age"] = \
#         titanic.loc[(titanic["Sex"] == "female") , "Age"].fillna(28.34)
#     titanic.loc[(titanic["Sex"] == "male") , "Age"] = \
#         titanic.loc[(titanic["Sex"] == "male") , "Age"].fillna(30.62)
#         (titanic[titanic['Survived']==0]["Age"].mean())    
    
#     titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 1
#     titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 2
#     titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 3
    
    titanic.drop(['Name', 
#                   'Ticket', 
#                   'Cabin',
#                   'Age',
#                   'Sex',
#                   'Fare',
                  'SibSp',
                  'Parch',
#                   'Title',
#                   'Pclass',
                  ], axis=1, inplace=True)
    
    return titanic
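
The title rule above splits Name on the comma and keeps the first whitespace token; a one-line check on a name in the usual "Last, Title. First" format:

In [ ]:
# Example name in the Name column's format (shown for illustration).
'Braund, Mr. Owen Harris'.split(',')[1].split()[0]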

In [24]:
df = clean_data(train)
df_train = df.copy()
df_train.drop('PassengerId', axis=1, inplace=True)
df_test  = clean_data(test)
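
A quick sanity check (a sketch): the cleaning step should leave no missing values in either frame.

In [ ]:
# Total remaining NaNs in the cleaned train and test frames.
df.isnull().sum().sum(), df_test.isnull().sum().sum()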

In [28]:
df.describe().T


Out[28]:
count mean std min 25% 50% 75% max
PassengerId 891 446.000000 257.353842 1.00000 223.50000 446.000000 668.500000 891
Survived 891 0.383838 0.486592 0.00000 0.00000 0.000000 1.000000 1
Pclass 891 2.308642 0.836071 1.00000 2.00000 3.000000 3.000000 3
Age 891 0.367020 0.162746 0.00525 0.27500 0.350000 0.437500 1
Ticket 891 322.291807 179.517825 0.00000 158.50000 337.000000 516.500000 556
Fare 891 0.062858 0.096995 0.00000 0.01544 0.028213 0.060508 1
Cabin 891 -0.221100 1.599664 -1.00000 -1.00000 -1.000000 -1.000000 7
Embarked 891 1.536476 0.791503 0.00000 1.00000 2.000000 2.000000 2
Title 891 40.641975 18.612814 28.00000 28.00000 28.000000 50.000000 80

In [26]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null object
Age           891 non-null float64
Ticket        891 non-null int16
Fare          891 non-null float64
Cabin         891 non-null int8
Embarked      891 non-null int8
Title         891 non-null int64
AgeSex        891 non-null object
AgeSexFare    891 non-null object
dtypes: float64(2), int16(1), int64(3), int8(2), object(3)
memory usage: 66.1+ KB

In [27]:
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
train[train['Survived']==1]["Age"].hist(bins=20, label='survived')
plt.title('Survived')
plt.subplot(1,2,2)
train[train['Survived']==0]["Age"].hist(bins=20)
plt.title('Did not survive')


Out[27]:
<matplotlib.text.Text at 0x11a239510>

In [29]:
df.head()


Out[29]:
PassengerId Survived Pclass Sex Age Ticket Fare Cabin Embarked Title AgeSex AgeSexFare
0 1 0 3 -10 0.2750 518 0.014151 -1 2 28 -2.75 -0.0389154
1 2 1 1 10 0.4750 532 0.139136 2 0 80 4.75 0.660895
2 3 1 3 10 0.3250 551 0.015469 -1 2 50 3.25 0.0502729
3 4 1 1 10 0.4375 49 0.103644 2 2 80 4.375 0.453444
4 5 0 3 -10 0.4375 472 0.015713 -1 2 28 -4.375 -0.0687424

Random Forest


In [30]:
y = df_train.pop('Survived').values
X = df_train.values
X_test = df_test.values

In [31]:
rf = RandomForestClassifier(40, n_jobs=-1)
rf.fit(X,y)


Out[31]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
feat_rank = np.argsort(rf.feature_importances_)[::-1]
feat_rank


Out[32]:
array([8, 9, 3, 4, 2, 1, 0, 7, 5, 6])

In [33]:
df_train.columns[feat_rank]


Out[33]:
Index([u'AgeSex', u'AgeSexFare', u'Ticket', u'Fare', u'Age', u'Sex', u'Pclass',
       u'Title', u'Cabin', u'Embarked'],
      dtype='object')

In [34]:
df_features = pd.DataFrame(rf.feature_importances_, index=df_train.columns, columns=['feature_value'])

In [35]:
df_features.sort_values('feature_value', ascending=False)


Out[35]:
feature_value
AgeSex 0.233618
AgeSexFare 0.173260
Ticket 0.148060
Fare 0.112366
Age 0.078279
Sex 0.073964
Pclass 0.066506
Title 0.055003
Cabin 0.038744
Embarked 0.020201

In [36]:
scores = np.zeros((feat_rank.shape[0], 2))
for i in range(1, feat_rank.shape[0] + 1):
    features = [df_train.columns[feat_rank][x] for x in range(i)]
    scores[i-1] = (i, cross_val_score(rf, df[features], df['Survived'], cv=10).mean())
scores


Out[36]:
array([[  1.        ,   0.77216661],
       [  2.        ,   0.76326495],
       [  3.        ,   0.82048292],
       [  4.        ,   0.82043241],
       [  5.        ,   0.83058365],
       [  6.        ,   0.8193335 ],
       [  7.        ,   0.82837334],
       [  8.        ,   0.83732522],
       [  9.        ,   0.83955964],
       [ 10.        ,   0.82499064]])
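
The CV curve suggests a best feature count; a sketch pulling it straight out of the scores array:

In [ ]:
# Feature count with the highest mean 10-fold CV score (a sketch).
int(scores[np.argmax(scores[:, 1]), 0])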

In [37]:
plt.plot(scores[:, 0], scores[:, 1])


Out[37]:
[<matplotlib.lines.Line2D at 0x11adcc490>]

In [39]:
cross_val_score(rf, df[features], df['Survived'], cv=10).mean()


Out[39]:
0.83731273408239704

In [43]:
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure(figsize=(12,5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), df_train.columns[indices])
plt.xlim([-1, X.shape[1]])
plt.show()


Feature ranking:
1. feature 8 (0.213818)
2. feature 9 (0.190771)
3. feature 3 (0.150035)
4. feature 4 (0.117305)
5. feature 2 (0.082685)
6. feature 1 (0.077373)
7. feature 0 (0.061037)
8. feature 7 (0.048072)
9. feature 5 (0.038854)
10. feature 6 (0.020050)

In [38]:
features = [df_train.columns[feat_rank][x] for x in range(9)]
features


Out[38]:
['AgeSex',
 'AgeSexFare',
 'Ticket',
 'Fare',
 'Age',
 'Sex',
 'Pclass',
 'Title',
 'Cabin']

In [51]:
# features = [df_train.columns[indices][x] for x in range(9)]
# features

In [45]:
X = df_train[features].values
X


Out[45]:
array([[-2.75, -0.03891540829607214, 518, ..., 3, 28, -1],
       [4.75, 0.6608947430675433, 532, ..., 1, 80, 2],
       [3.25, 0.05027285190849946, 551, ..., 3, 50, -1],
       ..., 
       [3.5, 0.160199730954238, 553, ..., 3, 50, -1],
       [-3.25, -0.190307325836591, 8, ..., 1, 28, 2],
       [-4.0, -0.060507970265993034, 466, ..., 3, 28, -1]], dtype=object)

In [46]:
def create_submission(model, train, test, features, filename):

    # Fit on the training data, then predict survival for the test set.
    model.fit(train[features], train['Survived'])
    predictions = model.predict(test[features])

    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    
    submission.to_csv(filename, index=False)

In [47]:
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import RandomizedSearchCV

# build a classifier
clf = RandomForestClassifier()


# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 6),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              'n_estimators': [10, 40, 50, 60],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs=-1)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)


RandomizedSearchCV took 4.30 seconds for 20 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.831 (std: 0.008)
Parameters: {'bootstrap': True, 'min_samples_leaf': 7, 'n_estimators': 50, 'min_samples_split': 8, 'criterion': 'gini', 'max_features': 3, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.829 (std: 0.004)
Parameters: {'bootstrap': True, 'min_samples_leaf': 10, 'n_estimators': 60, 'min_samples_split': 1, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.824 (std: 0.025)
Parameters: {'bootstrap': False, 'min_samples_leaf': 10, 'n_estimators': 60, 'min_samples_split': 9, 'criterion': 'entropy', 'max_features': 4, 'max_depth': None}
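
The winning parameter set can be reused directly, since best_params_ is a plain dict; a sketch:

In [ ]:
# Build a fresh classifier from the randomized search's best parameters (a sketch).
RandomForestClassifier(**random_search.best_params_)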


In [50]:
# use a full grid over all parameters
param_grid = {'max_depth': [1, 2, 4, None],
              'max_features': ['sqrt', 'log2', None],
              'min_samples_split': [1, 2, 6, 8, 10],
              'min_samples_leaf': [1, 2, 4, 6],
              'bootstrap': [True, False],
              'n_estimators': [30, 40, 50, 60, 100],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)


GridSearchCV took 2313.00 seconds for 4800 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.846 (std: 0.022)
Parameters: {'bootstrap': True, 'min_samples_leaf': 2, 'n_estimators': 100, 'min_samples_split': 1, 'criterion': 'gini', 'max_features': None, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.846 (std: 0.022)
Parameters: {'bootstrap': True, 'min_samples_leaf': 2, 'n_estimators': 40, 'min_samples_split': 10, 'criterion': 'gini', 'max_features': None, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.846 (std: 0.016)
Parameters: {'bootstrap': True, 'min_samples_leaf': 4, 'n_estimators': 40, 'min_samples_split': 6, 'criterion': 'gini', 'max_features': None, 'max_depth': None}


In [49]:
grid_search.best_estimator_


Out[49]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=6,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
create_submission(grid_search.best_estimator_,
                        df, df_test, features, "../submissions/rf_submission.csv")
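
A quick check of the written file (a sketch): a Titanic submission needs 418 rows and exactly the PassengerId and Survived columns.

In [ ]:
sub = pd.read_csv("../submissions/rf_submission.csv")
sub.shape, sub.columns.tolist()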

Random Forest Results

0.79426
['AgeSex', 'AgeSexFare', 'Fare', 'Sex', 'Pclass', 'Age']
create_submission(RandomForestClassifier(
                        bootstrap=True,
                        min_samples_leaf=3,
                        n_estimators=20,
                        min_samples_split=9,
                        criterion='entropy',
                        max_features=4,
                        max_depth=None),
                  df, df_test, features, "submission.csv")

0.78469
['AgeSex', 'AgeSexFare', 'Fare', 'Age', 'Pclass', 'Sex']
create_submission(RandomForestClassifier(50, min_samples_split=4, min_samples_leaf=2), \
                  df, df_test, predictors, "submission.csv")
0.76555
['AgeSex', 'AgeSexFare', 'Fare', 'Age']
create_submission(RandomForestClassifier(50, min_samples_split=4, min_samples_leaf=2), \
                  df, df_test, features, "submission.csv")

In [ ]:
trees_accuracy = []
for i in xrange(1,X.shape[1]):
    rf = RandomForestClassifier(50, max_features = i, min_samples_split=4, min_samples_leaf=2)
    rf.fit(X, y)
    trees_accuracy.append(rf.score(X,y))

In [ ]:
plt.plot(range(1, X.shape[1]), trees_accuracy, '-o')

SVM


In [ ]:
pipeline = Pipeline([('scaler', StandardScaler()), 
                     ('svc', SVC(kernel='linear'))])
pipeline.fit(X, y)

In [ ]:
# (degree only affects the 'poly' kernel; it is ignored by 'linear' and 'rbf')
parameters = {'kernel':['linear','rbf'], 
              'C':np.linspace(.001,10,5),'degree':np.linspace(0,10,5)}

gsCV = GridSearchCV(estimator=pipeline.steps[1][1],
                    param_grid=parameters,scoring='accuracy', cv=5)

In [ ]:
X = pipeline.steps[0][1].fit_transform(X)

In [ ]:
gsCV.fit(X,y)

In [ ]:
gsCV.grid_scores_, gsCV.best_params_

mean: 0.78151, std: 0.03323, params: {'C': 25.00075, 'degree': 0.0}
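
Scaling inside the pipeline keeps the scaler from being fit on the validation folds during cross-validation; a sketch using the features list and y defined above:

In [ ]:
# Re-fit the scaler on each training fold by cross-validating the whole pipeline.
cross_val_score(Pipeline([('scaler', StandardScaler()),
                          ('svc', SVC(kernel='rbf', C=2.5))]),
                df_train[features], y, cv=5).mean()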


In [ ]:
def svm_submission(model, train, test, features, filename):

    model.fit(train[features], train['Survived'])
    predictions = model.predict(test[features])

    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    
    submission.to_csv(filename, index=False)

In [ ]:
svm_features = [df_train.columns[feat_rank][x] for x in range(8)]
svm_features

In [ ]:
create_submission(Pipeline([('scaler', StandardScaler()),
                    ('svc', SVC(kernel='rbf', C=2.5, degree=2.5))]), \
                  df, df_test, svm_features, "../submissions/svm_submission.csv")

Gradient Boosting


In [ ]:
X = df_train
X.head()

In [ ]:
gdb = GradientBoostingClassifier(
                n_estimators=3000,
                learning_rate = 0.01, 
                max_depth = 4,
                max_features = 0.1,
                min_samples_leaf = 17)
gdb.fit(X,y)
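
Boosted models expose per-stage predictions, which helps judge whether 3000 stages are actually needed; a sketch of training accuracy per stage:

In [ ]:
# staged_predict yields predictions after each boosting stage; training-set
# accuracy only, so the curve is optimistic (a sketch).
plt.plot([np.mean(p == y) for p in gdb.staged_predict(X)])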

In [ ]:
feat_rank = np.argsort(gdb.feature_importances_)[::-1]
df_train.columns[feat_rank]

In [ ]:
boost_features = [df_train.columns[feat_rank][x] for x in range(8)]
boost_features

In [ ]:
df_train[boost_features].head()

In [ ]:
X = df_train[boost_features]
X.head()

In [ ]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6],
              'min_samples_leaf': [3, 5, 9, 17],
              'max_features': [1.0, 0.3, 0.1]}
gdb_grid = GradientBoostingClassifier(n_estimators=6000)
gs_cv = GridSearchCV(gdb_grid, param_grid).fit(X,y)

gs_cv.best_params_

In [ ]:
gs_cv.grid_scores_

# Best params from an earlier run:
# {'learning_rate': 0.01,
#  'max_depth': 4,
#  'max_features': 0.1,
#  'min_samples_leaf': 17}

In [ ]:
create_submission(GradientBoostingClassifier(
                n_estimators=3000,
                learning_rate = 0.01, 
                max_depth = 4,
                max_features = 0.1,
                min_samples_leaf = 9),
                df, df_test, boost_features, "../submissions/gdboost_submission.csv")

Adaptive Boosting


In [ ]:
X = df_train
X.head()

In [ ]:
ada = AdaBoostClassifier(
                n_estimators=3000,
                learning_rate = 0.01)
ada.fit(X,y)
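
AdaBoost gives the same per-round view via staged_score; a sketch:

In [ ]:
# Training accuracy after each boosting round (optimistic, as above; a sketch).
plt.plot(list(ada.staged_score(X, y)))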

In [ ]:
feat_rank = np.argsort(ada.feature_importances_)[::-1]
ada_features = [df_train.columns[feat_rank][x] for x in range(6)]
ada_features

In [ ]:
X = df_train[ada_features]
X.head()

In [ ]:
param_grid = {'learning_rate': [1, 0.1, 0.05, 0.02, 0.01]}

ada_grid = AdaBoostClassifier(n_estimators=6000)
ada_cv = GridSearchCV(ada_grid, param_grid).fit(X,y)

ada_cv.best_params_

In [ ]:
create_submission(AdaBoostClassifier(
                n_estimators=3000,
                learning_rate = 0.01),
                df, df_test, ada_features, "../submissions/adaboost_submission.csv")

In [ ]: