In [13]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

Read data


In [2]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
all_df = pd.concat([train_df, test_df])

all_df['is_test'] = all_df.Survived.isnull()
all_df.index = all_df.Survived
del all_df['Survived']

In [3]:
all_df.head()


Out[3]:
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Ticket is_test
Survived
0 22 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 A/5 21171 False
1 38 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 PC 17599 False
1 26 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 STON/O2. 3101282 False
1 35 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 113803 False
0 35 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 373450 False

Target variable


In [4]:
train_df.describe()


Out[4]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

The target variable is Survived.

Quality metric

Your score is the percentage of passengers you predict correctly, i.e. accuracy.
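
As a toy illustration (made-up labels, not competition data), accuracy is simply the fraction of predictions that match the true labels:


In [ ]:
y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])

print((y_true == y_pred).mean())       # 0.75
print(accuracy_score(y_true, y_pred))  # the same value via sklearn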

Model

One variable model

Let's build a very simple baseline model based on one rule: nobody survived.


In [36]:
def select_features(df):
    # Use only the numeric columns; drop helper columns.
    non_obj_feats = df.columns[ df.dtypes != 'object' ]
    black_list = ['is_test']
    
    return [feat for feat in non_obj_feats if feat not in black_list ]

def get_X_y(df):
    feats = select_features(df)
    
    X = df[feats].values
    y = df.index.values.astype(int)  # Survived was moved into the index
    
    return X, y

def check_quality(model, X, y, n_folds=5, random_state=0, shuffle=False):
    # random_state only has an effect (and is only allowed) when shuffle=True
    skf = StratifiedKFold(n_splits=n_folds, shuffle=shuffle,
                          random_state=random_state if shuffle else None)
    scores = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        
        scores.append(score)
        
    return np.mean(scores), np.std(scores)

def train_and_verify(all_df, model):
    X, y = get_X_y( all_df[ all_df.is_test == False ] )
    return check_quality(model, X, y)

In [5]:
class SingleVariableModel(BaseEstimator, ClassifierMixin):
    def __init__(self, seed=1):
        self.seed = seed  # kept for the estimator API; no randomness is used

    def fit(self, X, y):
        # Nothing to learn: the model always predicts the same class.
        return self
        
    def predict(self, X):
        # Predict that nobody survived.
        return [0] * len(X)
    
    def __repr__(self):
        return 'SingleVariableModel'

Run & evaluate the single variable model


In [37]:
train_and_verify(all_df, SingleVariableModel())


Out[37]:
(0.61616490890978648, 0.0015536004208290756)

What do you think about this result? Note that 0.616 is exactly the share of passengers who did not survive (1 - 0.384, the Survived mean in describe() above): this baseline simply predicts the majority class.
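
We can verify that directly, using the train_df loaded above:


In [ ]:
# The all-zeros baseline scores exactly the share of non-survivors.
1 - train_df.Survived.mean()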

Let's build a more advanced model

Missing values

There are several ways to handle missing values; here we'll simply fill them with -1.
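
As an aside, one common alternative (a sketch only, not used in this notebook) is to impute the numeric columns with their per-column medians, which are robust to outliers such as the largest fares:


In [ ]:
# Hypothetical alternative: fill numeric NaNs with per-column medians
# (the notebook itself uses the -1 sentinel in the next cell).
imputed = all_df.copy()
num_cols = imputed.columns[imputed.dtypes != 'object']
imputed[num_cols] = imputed[num_cols].fillna(imputed[num_cols].median())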


In [7]:
all_df.fillna(-1, inplace=True)

In [38]:
train_and_verify(all_df, RandomForestClassifier())


Out[38]:
(0.6836195074308804, 0.045102412780797671)

The result looks better than the previous one (0.683 vs 0.616).
Let's improve it further by also using the object-typed features ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket'], which select_features currently drops.
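
A minimal sketch of one possible approach (assuming simple integer codes via pd.factorize; extracting richer features, e.g. titles from Name, is left to the reader):


In [ ]:
# Sketch: encode the object columns as integer codes so select_features
# picks them up (they become numeric) and the forest can split on them.
for col in ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']:
    all_df[col + '_cat'] = pd.factorize(all_df[col])[0]

train_and_verify(all_df, RandomForestClassifier())

Note that factorize assigns arbitrary codes; tree models can still split on them, though high-cardinality columns such as Ticket and Name may add more noise than signal.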

