Titanic: Machine Learning from Disaster

This example is a pandas-ml translation of Kaggle's "Getting Started With Python II" tutorial. To run this Jupyter Notebook, you must first download the data into the titanic_data directory.

https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii


In [1]:
import os

import numpy as np
import pandas as pd

# Keep rendered frames/series short: show at most 6 rows before truncating.
pd.options.display.max_rows = 6

In [2]:
import pandas_ml as pdml

In [3]:
# Read the Kaggle CSVs (first column becomes the index) and wrap each one
# directly in a pdml.ModelFrame.  Only the training set names a target column.
train_df = pdml.ModelFrame(
    pd.read_csv(os.path.join('titanic_data', 'train.csv'), header=0, index_col=0),
    target='Survived'
)
test_df = pdml.ModelFrame(
    pd.read_csv(os.path.join('titanic_data', 'test.csv'), header=0, index_col=0)
)
train_df


Out[3]:
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30.0000 C148 C
891 0 3 Dooley, Mr. Patrick male 32 0 0 370376 7.7500 NaN Q

891 rows × 11 columns


In [4]:
# Display the target column of the ModelFrame, i.e. the 'Survived' column
# that was named as `target` when train_df was constructed.
train_df.target


Out[4]:
PassengerId
1      0
2      1
3      1
      ..
889    0
890    1
891    0
Name: Survived, dtype: int64

In [5]:
# Display the data columns of the ModelFrame: every column of train_df
# other than the 'Survived' target.
train_df.data


Out[5]:
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
3 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
... ... ... ... ... ... ... ... ... ... ...
889 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
890 1 Behr, Mr. Karl Howell male 26 0 0 111369 30.0000 C148 C
891 3 Dooley, Mr. Patrick male 32 0 0 370376 7.7500 NaN Q

891 rows × 10 columns


In [6]:
# Encode sex numerically (female -> 0, male -> 1) in a new 'Gender' column,
# then display the new column.
train_df['Gender'] = train_df['Sex'].replace(to_replace=dict(female=0, male=1))
train_df['Gender']


Out[6]:
PassengerId
1      1
2      0
3      0
      ..
889    0
890    1
891    1
Name: Gender, dtype: int64

In [7]:
# Fill missing ports with the most frequent port *label*.  idxmax() returns the
# index label of the largest count, whereas Series.argmax() returns an integer
# position on pandas >= 1.0 (and was only a deprecated alias of idxmax before
# that), which would fill the gaps with a number instead of a port code.
if train_df['Embarked'].isnull().any():
    most_common_port = train_df['Embarked'].value_counts().idxmax()
    train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)

# Build a {port label: integer code} lookup from the distinct ports returned
# by factorize(); the dict is displayed as the cell output.
ports_factor, ports = train_df['Embarked'].factorize()
ports_dict = dict(zip(ports, np.arange(len(ports))))
ports_dict


Out[7]:
{'C': 1, 'Q': 2, 'S': 0}

In [8]:
# Swap each port letter for the integer code assigned to it in ports_dict.
train_df['Embarked'] = train_df['Embarked'].replace(to_replace=ports_dict)

In [9]:
# Replace missing ages with the median age (median() ignores the NaN entries).
age_missing = train_df['Age'].isnull()
if age_missing.any():
    train_df.loc[age_missing, 'Age'] = train_df['Age'].median()

# Drop the remaining text columns ('Sex' is now represented by 'Gender'),
# then display the resulting frame.
train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin'], axis=1)
train_df


Out[9]:
Survived Pclass Age SibSp Parch Fare Embarked Gender
PassengerId
1 0 3 22 1 0 7.2500 0 1
2 1 1 38 1 0 71.2833 1 0
3 1 3 26 0 0 7.9250 0 0
... ... ... ... ... ... ... ... ...
889 0 3 28 1 2 23.4500 0 0
890 1 1 26 0 0 30.0000 1 1
891 0 3 32 0 0 7.7500 2 1

891 rows × 8 columns


In [10]:
# Apply to the test set the same preprocessing steps used on train_df.
test_df['Gender'] = test_df['Sex'].replace({'female': 0, 'male': 1})

# Fill missing ports with the most frequent port *label*: idxmax() returns the
# index label of the largest count, whereas Series.argmax() returns an integer
# position on pandas >= 1.0 (and was only a deprecated alias of idxmax before).
if test_df['Embarked'].isnull().any():
    most_common_port = test_df['Embarked'].value_counts().idxmax()
    test_df['Embarked'] = test_df['Embarked'].fillna(most_common_port)

# Reuse the port -> code mapping built from the training data so that both
# frames share one encoding.
test_df['Embarked'] = test_df['Embarked'].replace(ports_dict)

if test_df['Age'].isnull().any():
    test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

# Missing fares get the median fare of the passenger's own class (Pclass 1-3).
if test_df['Fare'].isnull().any():
    fare_missing = test_df['Fare'].isnull()
    for pclass in (1, 2, 3):
        in_class = test_df['Pclass'] == pclass
        # median() skips NaN, so the missing fares do not affect the estimate.
        class_median = test_df[in_class]['Fare'].median()
        test_df.loc[fare_missing & in_class, 'Fare'] = class_median

# Drop the text columns that were also dropped from train_df, then display.
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin'], axis=1)
test_df


Out[10]:
Pclass Age SibSp Parch Fare Embarked Gender
PassengerId
892 3 34.5 0 0 7.8292 2 1
893 3 47.0 1 0 7.0000 0 0
894 2 62.0 0 0 9.6875 2 1
... ... ... ... ... ... ... ...
1307 3 38.5 0 0 7.2500 0 1
1308 3 27.0 0 0 8.0500 0 1
1309 3 27.0 1 1 22.3583 1 1

418 rows × 7 columns


In [11]:
# Training...
# Create a 100-tree random forest through the ModelFrame's `ensemble` accessor
# and fit it on train_df in one step.  No random_state is passed, so the fitted
# forest can differ from run to run.
forest = train_df.fit(train_df.ensemble.RandomForestClassifier(n_estimators=100))
forest


Out[11]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
# Predicting...
# Run the fitted forest on the preprocessed test set; the displayed result is
# a series of 0/1 predictions indexed by PassengerId.
predicted = test_df.predict(forest)
predicted


Out[12]:
PassengerId
892     0
893     0
894     0
       ..
1307    0
1308    0
1309    1
dtype: int64

In [13]:
# Show the predictions with PassengerId as an ordinary column instead of the
# index.  The result is only displayed; `predicted` itself is not reassigned.
predicted.reset_index()


Out[13]:
PassengerId 0
0 892 0
1 893 0
2 894 0
... ... ...
415 1307 0
416 1308 0
417 1309 1

418 rows × 2 columns


In [14]:
# Count how many test passengers were assigned to each predicted class (0 / 1).
predicted.value_counts()


Out[14]:
0    261
1    157
dtype: int64