In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook

#load the files
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

#size of training dataset
train_samples = train.shape[0]

In [2]:
train.head(10)


Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C

In [3]:
print("Missing values:")
for f in train.columns:
    if pd.concat((train,test))[f].isnull().any():
        print("- {}: {:.1f}%".format(f, 100 * pd.concat((train,test))[f].isnull().sum()/len(pd.concat((train,test)))))


Missing values:
- Survived: 31.9%
- Age: 20.1%
- Fare: 0.1%
- Cabin: 77.5%
- Embarked: 0.2%

In [4]:
train.Embarked.unique()


Out[4]:
array(['S', 'C', 'Q', nan], dtype=object)

In [5]:
train.groupby(by='Survived').PassengerId.count()

# double number of not survived that survived


Out[5]:
Survived
0    549
1    342
Name: PassengerId, dtype: int64

In [6]:
train.Name.apply(lambda s: s.split(". ")[0].split(", ")[1]  ).unique()


Out[6]:
array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

def scale(X, fit_scaler=False):
    # Input is a dataframe
    #
    # Note the way of scaling (df[df.columns])
    # we want to mantain the dataframe (instead of numpy array)
    if fit_scaler:
        X[X.columns] = scaler.fit_transform(X[X.columns])
    else:
        X[X.columns] = scaler.transform(X[X.columns])
        
    return X 

def preprocess(df):
    X = df[['Pclass','Sex']].copy()

    # feature engineering
    X.Sex = X.Sex.map({'female':1, 'male':0})
    X['Family'] = df.Parch + df.Parch
    
    return X

In [8]:
# plotting a scatter matrix
def plot_matrix(X_train, y_train):
    colormap = {0:'firebrick',1:'steelblue'}
    colors = np.vectorize(colormap.get)(y_train)

    pd.plotting.scatter_matrix(X_train, c=colors, marker = 'o', s=30,
                               hist_kwds={'bins':15}, figsize=(9,9));

In [9]:
def save_to_file(clf, X_test):
    import os

    predictions = clf.predict(X_test)

    passengerId = 892
    file = "PassengerId,Survived" + os.linesep

    for i in range(len(X_test)):
        file += "{},{}".format(passengerId, (int)(predictions[i]))  + os.linesep
        passengerId += 1

    # Save to file
    with open('attempt.txt', 'w') as f:
        f.write(file)

In [10]:
#baseline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

def baseline(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

    dummy = DummyClassifier(random_state=0)

    #be aware of y as column vector
    dummy.fit(X_train, y_train.values.reshape(-1))
    acc = dummy.score(X_val.values, y_val.values.reshape(-1))
    print('Accuracy: {:.2f}\n'.format(acc))
    
    # Combined report with all above metrics
    print(classification_report(y_val, dummy.predict(X_val), target_names=['Not Survived', 'Survived']))

Baseline starts from 0.54


In [11]:
X_train = preprocess(train)
y_train = train[['Survived']]

baseline(X_train, y_train)


Accuracy: 0.54

              precision    recall  f1-score   support

Not Survived       0.63      0.62      0.63       139
    Survived       0.39      0.40      0.40        84

 avg / total       0.54      0.54      0.54       223



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score

def check_model(X, y):
    rfc = RandomForestClassifier(random_state=0)
    scores = cross_validate(rfc, X, y, cv=10, scoring='accuracy')

    print("Train scores: {:.3f}".format(scores['train_score'].mean()))
    print("Test scores: {:.3f}".format(scores['test_score'].mean()))
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
    rfc.fit(X_train, y_train)
    
    y_pred = rfc.predict(X_val)
    print("Accuracy: {:.3f}".format(accuracy_score(y_val, y_pred)))

    y_probs = rfc.predict_proba(X_val)
    auc = roc_auc_score(y_val, y_probs[:,1])
    print("AUC:{:.3f}".format(auc))
    
    print(classification_report(y_val, y_pred, target_names=['Not Survived', 'Survived']))
    
    return rfc

In [477]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score

def grid_search(X, y, test_size=0.25):
    max_range = np.arange(3, X.shape[1]+1, 5)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size , random_state=0)

    '''
    Best params:{'min_samples_leaf': 3, 
                 'n_estimators': 50, 
                 'bootstrap': True, 
                 'max_features': 'sqrt', 
                 'max_depth': 8, 
                 'min_samples_split': 13, 
                 'class_weight': 'balanced'}
    '''
    params = {
             'n_estimators': [50, 100, 150],
             'max_features': ['sqrt'],
             'max_depth' : [8, 10, 12, 50],
             'class_weight': ['balanced', {1:2}],
             'min_samples_split': [5, 10, 13, 15],
             'min_samples_leaf': [1, 3, 5, 7],
             'bootstrap': [True, False],
             }
    
    params = {'n_estimators': [150, 200, 300, 500], 
              'bootstrap': [True], 
              'class_weight': [{1:2}],
              'max_depth' : [25],
              'max_features': ['sqrt']}

    rfc = RandomForestClassifier(n_jobs=2, random_state=0)
    
    grid_rfc = GridSearchCV(rfc, param_grid=params, cv=10, n_jobs=2, scoring='accuracy')
    grid_rfc.fit(X_train, y_train)

    best_rfc = grid_rfc.best_estimator_

    y_pred = best_rfc.predict(X_val)
    print("Accuracy: {:.3f}".format(accuracy_score(y_val, y_pred)))

    y_probs = best_rfc.predict_proba(X_val)
    auc = roc_auc_score(y_val, y_probs[:,1])
    print("AUC:{:.3f}".format(auc))
    
    print("Best params: {}\n".format(grid_rfc.best_params_))
    print(classification_report(y_val, y_pred, target_names=['Not Survived', 'Survived']))
    
    return best_rfc

In [478]:
X_train = preprocess(train).values
y_train = train[['Survived']].values.reshape(-1)

rfc = check_model(X_train, y_train)


Train scores: 0.807
Test scores: 0.795
Accuracy: 0.794
AUC:0.867
              precision    recall  f1-score   support

Not Survived       0.83      0.85      0.84       139
    Survived       0.74      0.70      0.72        84

 avg / total       0.79      0.79      0.79       223


In [479]:
X_test = preprocess(test)
save_to_file(rfc, X_test)

0.77511 in Kaggle!!!



In [480]:
def preprocess_2(df):
    X = df[['Pclass','Sex']].copy()

    # feature engineering
    X.Sex = X.Sex.map({'female':1, 'male':0})  
    X['Family'] = (df.SibSp*df.Parch)/(df.SibSp + df.Parch + 0.0001)
    
    X['Age'] = df.Age.fillna(df.Age.median())
    group_pclass_fare = df.groupby(by='Pclass').Fare.median()
    X['Fare'] = np.where(df.Fare.isnull(), group_pclass_fare[df.Pclass], df.Fare)
    
    return X

In [481]:
X_train = preprocess_2(train)
rfc = check_model(X_train, y_train)
rfc = grid_search(X_train, y_train, test_size=0.1)


Train scores: 0.965
Test scores: 0.816
Accuracy: 0.852
AUC:0.862
              precision    recall  f1-score   support

Not Survived       0.86      0.91      0.89       139
    Survived       0.84      0.75      0.79        84

 avg / total       0.85      0.85      0.85       223

Accuracy: 0.833
AUC:0.879
Best params: {'bootstrap': True, 'n_estimators': 300, 'class_weight': {1: 2}, 'max_depth': 25, 'max_features': 'sqrt'}

              precision    recall  f1-score   support

Not Survived       0.81      0.92      0.86        51
    Survived       0.88      0.72      0.79        39

 avg / total       0.84      0.83      0.83        90


In [482]:
X_test = preprocess_2(test)
save_to_file(rfc, X_test)

Kaggle 0.74641


Cleaning and Feature Engeneering


In [483]:
def process_sex(data):
    data.Sex = data.Sex.map({'female':0, 'male':1})
    return data

def process_embarked(data):
    #fill with most common
    most_common = data['Embarked'].value_counts().index[0]
    data.Embarked = data.Embarked.fillna(most_common)
    #U of unknown
    #data.Embarked = data.Embarked.fillna('U')
    #data.Embarked = data.Embarked.map({'S':0,'C':1,'Q':2,'U':3})
    dummies = pd.get_dummies(data.Embarked, prefix='Embarked')
    data = pd.concat([data, dummies], axis=1)
    return data

def process_family(data):
    data['Family'] = data.SibSp + data.Parch   
    
    def getFamilySize(num):
        if num == 0:
            return 'alone'
        elif num <= 2:
            return 'small'
        elif num == 3:
            return 'medium'
        else:
            return 'large'
        
    data['FamilySize'] = data.Family.apply(getFamilySize) 
    dummies = pd.get_dummies(data.FamilySize, prefix='FamilySize')
    data = pd.concat([data, dummies], axis=1).drop('FamilySize', axis=1)
    
    return data


def process_name(data):
    dict_names = {
                "Capt":       "Officer",
                "Col":        "Officer",
                "Major":      "Officer",
                "Jonkheer":   "Royalty",
                "Don":        "Royalty",
                "Sir" :       "Royalty",
                "Dr":         "Officer",
                "Rev":        "Officer",
                "the Countess":"Royalty",
                "Dona":       "Royalty",
                "Mme":        "Mrs",
                "Mlle":       "Miss",
                "Ms":         "Mrs",
                "Mr" :        "Mr",
                "Mrs" :       "Mrs",
                "Miss" :      "Miss",
                "Master" :    "Master",
                "Lady" :      "Royalty"
                }
    
    data['Name'] = data.Name.apply(lambda s: s.split(". ")[0].split(", ")[1])
    data.Name = data.Name.map(dict_names)
    dummies = pd.get_dummies(data.Name, prefix='Name')
    data = pd.concat([data, dummies], axis=1)
    return data

def process_age(data):
    grouped_name = data.groupby(by=['Sex','Pclass','Name']).Age.median()

    data.Age = data.apply(lambda r: grouped_name[r.Sex, r.Pclass, r.Name] if np.isnan(r.Age) else r.Age, axis=1)
    
    #Just in case there is no median() por Sex-Pclass-Name
    if(data.Age.isnull().any()):
        grouped_name_2 = data.groupby(by=['Sex','Pclass']).Age.median()
        data.Age = data.apply(lambda r: grouped_name_2[r.Sex, r.Pclass] if np.isnan(r.Age) else r.Age, axis=1)
        print('Age from only 2 clasess')
        
    return data

def process_fare(data):
    group_pclass_fare = data.groupby(by='Pclass').Fare.median()
    data.Fare = np.where(data.Fare.isnull(), group_pclass_fare[data.Pclass], data.Fare)
    return data
    
def process_cabin(data):
    data['Deck'] = data.Cabin.str[0]
    data.loc[data.Deck.isnull(), 'Deck'] = 'U' #unknown
    #data.Deck = data.Deck.map({'NaN':0, 'F':1, 'E':2, 'C':3, 'D':4, 'B':5, 'G':6, 'A':7, 'T':8})
    dummies = pd.get_dummies(data.Deck, prefix='Deck')
    data = pd.concat([data, dummies], axis=1)
    
    #data['Room'] = np.where(data.Cabin.isnull(), 999, data.Cabin.str.split().str.get(0).str[1:])
    #data.Room = pd.to_numeric(data.Room)
    #data.loc[data.Room.isnull(), 'Room'] = 999
    
    return data


def process_ticket(data):
    #data['TicketNumber'] = data.Ticket.str.extractall("(.*\s)?(.+)")[1]
    data['TicketNumber'] = data.Ticket.str.extract("(.*\s)?(.+)", expand=True)[1]
    data['TicketCode'] = data.Ticket.str.extract("(.*\s)?(.+)", expand=True)[0]
    data.TicketCode = data.TicketCode.fillna('NAN')
    dummies = pd.get_dummies(data.TicketCode, prefix='TicketCode')
    data = pd.concat([data, dummies], axis=1)
    #special case LINE
    data.TicketNumber.replace('LINE', '0', inplace=True)
    data.TicketNumber = data.TicketNumber.astype('int64')
    dummies = pd.get_dummies(data.TicketNumber, prefix='TicketNumber')
    data = pd.concat([data, dummies], axis=1)
    
    data["TicketGroupSize"] = data.groupby('Ticket')['Ticket'].transform('count')
    
    return data

def process_pclass(data):
    dummies = pd.get_dummies(data.Pclass, prefix='Pclass')
    data = pd.concat([data, dummies], axis=1)
    return data

def process(data):
    data = process_sex(data)
    data = process_embarked(data)
    data = process_family(data)
    data = process_name(data)
    data = process_age(data)
    data = process_fare(data)
    data = process_cabin(data)
    data = process_ticket(data)
    data = process_pclass(data)
    
    doNotInclude = ['PassengerId','Name','Pclass','Cabin','Deck','Ticket','Embarked','TicketCode','TicketNumber']
    data = data.drop(doNotInclude, axis=1)
    return data

In [484]:
#concat for auto generated dummie features from categorical
data = pd.concat([train,test])
data = process(data)

processed_train = data[:train_samples]
processed_test = data[train_samples:]

In [485]:
X_train = processed_train.drop('Survived', axis=1).values
y_train = processed_train[['Survived']].values.ravel()

check_model(X_train, y_train)


Train scores: 0.982
Test scores: 0.813
Accuracy: 0.861
AUC:0.891
              precision    recall  f1-score   support

Not Survived       0.86      0.92      0.89       139
    Survived       0.85      0.76      0.81        84

 avg / total       0.86      0.86      0.86       223

Out[485]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [486]:
clf = grid_search(X_train, y_train)


Accuracy: 0.852
AUC:0.900
Best params: {'bootstrap': True, 'n_estimators': 200, 'class_weight': {1: 2}, 'max_depth': 25, 'max_features': 'sqrt'}

              precision    recall  f1-score   support

Not Survived       0.88      0.88      0.88       139
    Survived       0.80      0.81      0.80        84

 avg / total       0.85      0.85      0.85       223


In [487]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, clf.predict(X_train))


Out[487]:
array([[530,  19],
       [ 38, 304]])

In [488]:
X_test = processed_test.drop('Survived', axis=1)
save_to_file(clf, X_test)

In [489]:
def CHECK():
    import pandas as pd
    from sklearn.metrics import accuracy_score

    other = pd.read_csv('attempt_79904.txt')
    mine = pd.read_csv('attempt.txt')

    data = pd.merge(other,mine, on='PassengerId')
    acc= accuracy_score(data.Survived_x, data.Survived_y)
    
    print("Acc: {}".format(acc))

CHECK()


Acc: 0.9401913875598086

In [ ]: