In [238]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display
%matplotlib inline

train_input = pd.read_csv("../input/train.csv", dtype={"Age": np.float64})
test_input = pd.read_csv("../input/test.csv", dtype={"Age": np.float64})

df = pd.concat([train_input, test_input], ignore_index=True)
df.head()


Out[238]:
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket
0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171
1 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599
2 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282
3 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803
4 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450

In [239]:
print(df.hist())


[[<matplotlib.axes._subplots.AxesSubplot object at 0x1285A310>
  <matplotlib.axes._subplots.AxesSubplot object at 0x24FD6430>
  <matplotlib.axes._subplots.AxesSubplot object at 0x16B13B30>]
 [<matplotlib.axes._subplots.AxesSubplot object at 0x159D3210>
  <matplotlib.axes._subplots.AxesSubplot object at 0x18427990>
  <matplotlib.axes._subplots.AxesSubplot object at 0x1FFF28D0>]
 [<matplotlib.axes._subplots.AxesSubplot object at 0x26AC11F0>
  <matplotlib.axes._subplots.AxesSubplot object at 0x1827F8F0>
  <matplotlib.axes._subplots.AxesSubplot object at 0x26F67050>]]

In [240]:
categorical_columns = ['Sex', 'Embarked']
numerical_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
text_columns = ['Name', 'Ticket']

def category_to_numeric(df, column_name):
    for category in df[column_name].unique():
        category_column = column_name + '_' + str(category)
        if category_column in df.columns:
            df = df.drop(category_column, axis=1)
        if category_column not in numerical_columns:
            numerical_columns.append(category_column)
    df= pd.concat([df,pd.get_dummies(df[column_name], prefix=column_name)],axis=1)
    return df

In [241]:
print(df.hist())


[[<matplotlib.axes._subplots.AxesSubplot object at 0x12ADF410>
  <matplotlib.axes._subplots.AxesSubplot object at 0x2709CBB0>
  <matplotlib.axes._subplots.AxesSubplot object at 0x264F6350>]
 [<matplotlib.axes._subplots.AxesSubplot object at 0x2652A630>
  <matplotlib.axes._subplots.AxesSubplot object at 0x28163E10>
  <matplotlib.axes._subplots.AxesSubplot object at 0x26537A30>]
 [<matplotlib.axes._subplots.AxesSubplot object at 0x28F43DB0>
  <matplotlib.axes._subplots.AxesSubplot object at 0x28F8E4D0>
  <matplotlib.axes._subplots.AxesSubplot object at 0x28D537D0>]]

In [242]:
# Sex
sns.set(style="whitegrid")

g = sns.factorplot(x="Sex", y="Survived", data=df, size=4, palette="muted")
g.despine(left=True)
g.set_ylabels("survival probability")


Out[242]:
<seaborn.axisgrid.FacetGrid at 0x2001f090>

In [243]:
def get_sex_adult(row):
    age, sex = row
    if age < 18:
        return 'child'
    elif sex == 'female':
        return 'female_adult'
    else:
        return 'male_adult'

df['SexAdult'] = df[['Age', 'Sex']].apply(get_sex_adult, axis=1)
g = sns.factorplot(x="SexAdult", y="Survived", data=df, size=4, palette="muted")

if 'SexAdult' not in categorical_columns:
    categorical_columns.append('SexAdult')



In [244]:
# Embarked
df['Embarked'] = df['Embarked'].fillna('unknown')
if 'Embarked' not in categorical_columns:
    categorical_columns.append('Embarked')

df["Embarked_Category"] = pd.Categorical.from_array(df.Embarked).codes
if 'Embarked_Category' not in categorical_columns:
    categorical_columns.append('Embarked_Category')

g = sns.factorplot(x="Embarked_Category", y="Survived", data=df, size=4, palette="muted")
g.despine(left=True)
g.set_ylabels("survival probability")


Out[244]:
<seaborn.axisgrid.FacetGrid at 0x25c49e50>

In [245]:
df_ticket = pd.DataFrame(df['Ticket'].value_counts())
df_ticket.rename(columns={'Ticket':'TicketMembers'}, inplace=True)

df_ticket['Ticket_perishing_women'] = df.Ticket[(df.SexAdult == 'female_adult')
                                               & (df.Survived == 0.0)
                                               & ((df.Parch > 0) | (df.SibSp > 0))].value_counts()
df_ticket['Ticket_perishing_women'] = df_ticket['Ticket_perishing_women'].fillna(0)
df_ticket['TicketGroup_include_perishing_women'] = df_ticket['Ticket_perishing_women'] > 0
df_ticket['TicketGroup_include_perishing_women'] = df_ticket['TicketGroup_include_perishing_women'].astype(int)

df_ticket['Ticket_surviving_men'] = df.Ticket[(df.SexAdult == 'male_adult')
                                              & (df.Survived == 1.0)
                                              & ((df.Parch > 0) | (df.SibSp > 0))].value_counts()
df_ticket['Ticket_surviving_men'] = df_ticket['Ticket_surviving_men'].fillna(0)
df_ticket['TicketGroup_include_surviving_men'] = df_ticket['Ticket_surviving_men'] > 0
df_ticket['TicketGroup_include_surviving_men'] = df_ticket['TicketGroup_include_surviving_men'].astype(int)

df_ticket["TicketId"] = pd.Categorical.from_array(df_ticket.index).codes
df_ticket.loc[df_ticket[df_ticket['TicketMembers'] < 3].index, "TicketId"] = -1
df_ticket["TicketMembers_Simple"] = pd.cut(df_ticket['TicketMembers'], bins=[0,1,4,20], labels=[0,1,2])
if 'TicketGroup_include_perishing_women' not in df.columns:
    df = pd.merge(df, df_ticket, left_on="Ticket", right_index=True, how='left', sort=False)

if 'Ticket_perishing_women' not in numerical_columns:
    numerical_columns.append('Ticket_perishing_women')
if 'TicketGroup_include_perishing_women' not in numerical_columns:
    numerical_columns.append('TicketGroup_include_perishing_women')
if 'Ticket_surviving_men' not in numerical_columns:
    numerical_columns.append('Ticket_surviving_men')
if 'TicketGroup_include_surviving_men' not in numerical_columns:
    numerical_columns.append('TicketGroup_include_surviving_men')
if 'TicketId' not in numerical_columns:
    numerical_columns.append('TicketId')
if 'TicketMembers' not in numerical_columns:
    numerical_columns.append('TicketMembers')
    
g = sns.factorplot(x="TicketGroup_include_perishing_women", y="Survived", data=df, size=4, palette="muted")
g = sns.factorplot(x="Ticket_surviving_men", y="Survived", data=df, size=4, palette="muted")



In [246]:
# surname
df['surname'] = df['Name'].apply(lambda x: x.split(',')[0].lower())
df_surname = pd.DataFrame(df['surname'].value_counts())
df_surname.rename(columns={'surname':'SurnameMembers'}, inplace=True)

df_surname['Surname_perishing_women'] = df.surname[(df.SexAdult == 'female_adult')
                                               & (df.Survived == 0.0)
                                               & ((df.Parch > 0) | (df.SibSp > 0))].value_counts()
df_surname['Surname_perishing_women'] = df_surname['Surname_perishing_women'].fillna(0)
df_surname['SurnameGroup_include_perishing_women'] = df_surname['Surname_perishing_women'] > 0
df_surname['SurnameGroup_include_perishing_women'] = df_surname['SurnameGroup_include_perishing_women'].astype(int)

df_surname['Surname_surviving_men'] = df.surname[(df.SexAdult == 'male_adult')
                                              & (df.Survived == 1.0)
                                              & ((df.Parch > 0) | (df.SibSp > 0))].value_counts()
df_surname['Surname_surviving_men'] = df_surname['Surname_surviving_men'].fillna(0)
df_surname['SurnameGroup_include_surviving_men'] = df_surname['Surname_surviving_men'] > 0
df_surname['SurnameGroup_include_surviving_men'] = df_surname['SurnameGroup_include_surviving_men'].astype(int)

df_surname["SurnameId"] = pd.Categorical.from_array(df_surname.index).codes
df_surname.loc[df_surname[df_surname['SurnameMembers'] < 3].index, "SurnameId"] = -1
df_surname["SurnameMembers_Simple"] = pd.cut(df_surname['SurnameMembers'], bins=[0,1,4,20], labels=[0,1,2])
if 'SurnameGroup_include_perishing_women' not in df.columns:
    df = pd.merge(df, df_surname, left_on="surname", right_index=True, how='left', sort=False)


if 'Surname_perishing_women' not in numerical_columns:
    numerical_columns.append('Surname_perishing_women')
if 'SurnameGroup_include_perishing_women' not in numerical_columns:
    numerical_columns.append('SurnameGroup_include_perishing_women')
if 'Surname_surviving_men' not in numerical_columns:
    numerical_columns.append('Surname_surviving_men')
if 'SurnameGroup_include_surviving_men' not in numerical_columns:
    numerical_columns.append('SurnameGroup_include_surviving_men')
if 'SurnameId' not in numerical_columns:
    numerical_columns.append('SurnameId')
if 'SurnameMembers' not in numerical_columns:
    numerical_columns.append('SurnameMembers')
    
g = sns.factorplot(x="SurnameGroup_include_perishing_women", y="Survived", data=df, size=4, palette="muted")
g = sns.factorplot(x="SurnameGroup_include_surviving_men", y="Survived", data=df, size=4, palette="muted")



In [247]:
# title
import re
df['Name_title'] = df['Name'].apply(lambda x: re.search(' ([A-Za-z]+)\.', x).group(1))
df.loc[df[df['Name_title'] == 'Ms'].index, 'Name_title'] = 'Miss'
print(df['Name_title'].unique())
if 'Name_title' not in categorical_columns:
    categorical_columns.append('Name_title')
g = sns.factorplot(y="Name_title", x="Survived", data=df, size=4, palette="muted")


['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Major' 'Lady' 'Sir'
 'Mlle' 'Col' 'Capt' 'Countess' 'Jonkheer' 'Dona']

In [248]:
title_mapping = {
    "Mr": 1, 
    "Miss": 2, 
    "Ms": 2, 
    "Mlle": 2, 
    "Mrs": 3, 
    "Mme": 3,
    "Master": 4, 
    "Dr": 5, 
    "Rev": 6, 
    "Major": 7, 
    "Capt": 7,
    "Col": 7, 
    "Don": 9,
    "Dona": 9, 
    "Sir": 9, 
    "Lady": 10, 
    "Countess": 10, 
    "Jonkheer": 10, 
}
df["Name_titleCategory"] = df.loc[:,'Name_title'].map(title_mapping)

if 'Name_titleCategory' not in categorical_columns:
    categorical_columns.append('Name_titleCategory')
g = sns.factorplot(x="Name_titleCategory", y="Survived", data=df, size=4, palette="muted")



In [249]:
# FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch']
if 'FamilySize' not in numerical_columns:
    numerical_columns.append('FamilySize')
g = sns.factorplot(x="SibSp", y="Survived", data=df, size=4, palette="muted")
g = sns.factorplot(x="Parch", y="Survived", data=df, size=4, palette="muted")
g = sns.factorplot(x="FamilySize", y="Survived", data=df, size=4, palette="muted")



In [250]:
# Name Length?
df['NameLength'] = df["Name"].apply(lambda x: len(x))
if 'NameLength' not in numerical_columns:
    numerical_columns.append('NameLength')
g = sns.factorplot(y="NameLength", x="Survived", data=df, size=4, palette="muted")
g.despine(left=True)
g.set_ylabels("survival probability")


Out[250]:
<seaborn.axisgrid.FacetGrid at 0x2650fb90>

In [251]:
# Pclass
g = sns.factorplot(x="Pclass", y="Survived", data=df, size=4, palette="muted")



In [252]:
# cabin
# https://www.kaggle.com/c/titanic/prospector#1326
def get_cabin_location(cabin):
    if cabin == ' ':
        return 'no_cabin'
    # The cabin info consists of a letter (corresponding to a deck) 
    # and a cabin number, which is odd for cabins on the starboard side and even for the port.
    cabin_search_result = re.search('\d+', cabin)
    if cabin_search_result:
        type_code = np.int64(cabin_search_result.group(0))
        if type_code % 2 == 0:
            return 'port'
        else:
            return 'starboard'
    return 'unknown'

def get_cabin_deck(cabin):
    if cabin == ' ':
        return 'no_cabin'
    # The cabin info consists of a letter (corresponding to a deck) 
    # and a cabin number, which is odd for cabins on the starboard side and even for the port.
    cabin_search_result = re.search('[A-z]+', cabin)
    if cabin_search_result:
        return cabin_search_result.group(0)
    return 'unknown'

def get_cabin_count(cabin):
    if cabin == ' ':
        return 0
    cabin_search_result = re.findall('([A-z]\d+)', cabin)
    if cabin_search_result:
        return len(cabin_search_result)
    return 0

df['CabinLocation'] = df['Cabin'].fillna(' ').apply(get_cabin_location)
df['CabinDeck'] = df['Cabin'].fillna(' ').apply(get_cabin_deck)
df['CabinCount'] = df['Cabin'].fillna(' ').apply(get_cabin_count)

if 'CabinLocation' not in categorical_columns:
    categorical_columns.append('CabinLocation')
if 'CabinDeck' not in categorical_columns:
    categorical_columns.append('CabinDeck')
if 'CabinCount' not in numerical_columns:
    numerical_columns.append('CabinCount')

g = sns.factorplot(x="Survived", y="CabinLocation", data=df, size=4, palette="muted")
g = sns.factorplot(x="Survived", y="CabinDeck", data=df, size=4, palette="muted")
g = sns.factorplot(x="CabinCount", y="Survived", data=df, size=4, palette="muted")



In [253]:
df['CabinCategory'] = pd.Categorical.from_array(df.Cabin.fillna('0').apply(lambda x:x[0])).codes
g = sns.factorplot(y="Survived", x="CabinCategory", data=df, size=4, palette="muted")
if 'CabinCategory' not in categorical_columns:
    categorical_columns.append('CabinCategory')



In [254]:
# Fare
# df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df["Fare"] = df["Fare"].fillna(8.05)
print(df['Fare'].describe())
print(df['Fare'].hist())
g = sns.factorplot(x="Survived", y="Fare", data=df, size=4, palette="muted")


count    1309.000000
mean       33.276193
std        51.743584
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: Fare, dtype: float64
Axes(0.125,0.125;0.775x0.775)

In [255]:
df['TicketMembers'] = df['TicketMembers'].fillna(0)
print(df.head()[['Pclass','Fare', 'TicketMembers']])
df['Fare_per_ticket_member'] = df['Fare'] / (df['TicketMembers'])
print(df['Fare_per_ticket_member'].hist())
g = sns.factorplot(x="Survived", y="Fare_per_ticket_member", data=df, size=4, palette="muted")


   Pclass     Fare  TicketMembers
0       3   7.2500              1
1       1  71.2833              2
2       3   7.9250              1
3       1  53.1000              2
4       3   8.0500              1
Axes(0.125,0.125;0.775x0.775)

In [256]:
from math import log

class_fare = pd.DataFrame(columns=['count','mean','std','min','25%','50%','75%','max'])
class_fare.loc[1,:] = df[df['Pclass'] == 1]['Fare'].describe()
class_fare.loc[2,:] = df[df['Pclass'] == 2]['Fare'].describe()
class_fare.loc[3,:] = df[df['Pclass'] == 3]['Fare'].describe()

very_small_val = 0.01
df['Fare_standard_score_with_Pclass'] = df.apply(lambda row: (log(row['Fare'] + very_small_val) - log(class_fare.loc[row['Pclass'], 'mean'] + very_small_val)) / log(class_fare.loc[row['Pclass'], 'std'] + very_small_val), axis=1)
if 'Fare_standard_score_with_Pclass' not in numerical_columns:
    numerical_columns.append('Fare_standard_score_with_Pclass')

In [257]:
df[(df['Fare_standard_score_with_Pclass'] >= -0.5) & (df['Fare_standard_score_with_Pclass'] <= 0.5)]['Fare_standard_score_with_Pclass'].hist()
g = sns.factorplot(x="Survived", y="Fare_standard_score_with_Pclass", data=df, size=4, palette="muted")



In [258]:
from math import log

class_fare = pd.DataFrame(columns=['count','mean','std','min','25%','50%','75%','max'])
class_fare.loc[1,:] = df[df['Pclass'] == 1]['Fare_per_ticket_member'].describe()
class_fare.loc[2,:] = df[df['Pclass'] == 2]['Fare_per_ticket_member'].describe()
class_fare.loc[3,:] = df[df['Pclass'] == 3]['Fare_per_ticket_member'].describe()

very_small_val = 0.01
df['Fare_per_ticket_member_standard_score_with_Pclass'] = df.apply(lambda row: (log(row['Fare_per_ticket_member'] + very_small_val) - log(class_fare.loc[row['Pclass'], 'mean'] + very_small_val)) / log(class_fare.loc[row['Pclass'], 'std'] + very_small_val), axis=1)
if 'Fare_per_ticket_member_standard_score_with_Pclass' not in numerical_columns:
    numerical_columns.append('Fare_per_ticket_member_standard_score_with_Pclass')

In [259]:
df[(df['Fare_per_ticket_member_standard_score_with_Pclass'] >= -0.5) & (df['Fare_per_ticket_member_standard_score_with_Pclass'] <= 0.5)]['Fare_per_ticket_member_standard_score_with_Pclass'].hist()
g = sns.factorplot(x="Survived", y="Fare_per_ticket_member_standard_score_with_Pclass", data=df, size=4, palette="muted")



In [260]:
# https://www.kaggle.com/c/titanic/forums/t/11127/do-ticket-numbers-mean-anything
#print(df["Ticket"])
#print(df["Ticket"].value_counts())

def get_ticket_prefix(cabin):
    # The cabin info consists of a letter (corresponding to a deck) 
    # and a cabin number, which is odd for cabins on the starboard side and even for the port.
    cabin_search_result = re.search('[^\d]+', cabin)
    if cabin_search_result:
        return cabin_search_result.group(0).replace('/', '').replace('.', '').strip()
    return 'unknown'

df['TicketPrefix'] = df['Ticket'].apply(get_ticket_prefix)
g = sns.factorplot(y="TicketPrefix", x="Survived", data=df, size=8, palette="muted")

if 'TicketPrefix' not in categorical_columns:
    categorical_columns.append('TicketPrefix')



In [261]:
for col in categorical_columns:
    df = category_to_numeric(df, col)

In [262]:
# age prediction
from sklearn.ensemble import ExtraTreesRegressor

age_prediction_features = ['Fare', 'Fare_standard_score_with_Pclass',
                           #'Fare_per_ticket_member', 'Fare_per_ticket_member_standard_score_with_Pclass',
                           'Parch', 'Pclass', 'SibSp', 'Sex_female', 'Sex_male', 'FamilySize',
                           'NameLength', 'TicketMembers', 'TicketId', 
                           'Embarked_S', 'Embarked_C', 'Embarked_Q', 'Embarked_unknown', 
                           'Name_title_Mr', 'Name_title_Mrs', 'Name_title_Miss', 'Name_title_Master', 
                           'Name_title_Don', 'Name_title_Rev', 'Name_title_Dr', 'Name_title_Mme', 
                           'Name_title_Major', 'Name_title_Lady', 'Name_title_Sir', 'Name_title_Mlle', 'Name_title_Col', 
                           'Name_title_Capt', 'Name_title_Countess', 'Name_title_Jonkheer', 
                           'CabinLocation_no_cabin', 'CabinLocation_starboard', 'CabinLocation_port', 'CabinDeck_no_cabin', 
                           'CabinDeck_C', 'CabinDeck_E', 'CabinDeck_G', 'CabinDeck_D', 'CabinDeck_A', 'CabinDeck_B', 'CabinDeck_F', 'CabinDeck_T'
                          ]
age_prediction_tree_regressor = ExtraTreesRegressor(n_estimators=200)
age_X_train = df[age_prediction_features][df['Age'].notnull()]
age_Y_train = df['Age'][df['Age'].notnull()]
age_prediction_tree_regressor.fit(age_X_train, np.ravel(age_Y_train))

# predict only isnull values
df['Age_pred'] = df['Age']
df.loc[df[df['Age'].isnull()].index, 'Age_pred'] = age_prediction_tree_regressor.predict(df[age_prediction_features][df['Age'].isnull()])

if 'Age_pred' not in numerical_columns:
    numerical_columns.append('Age_pred')

# add ageGroup
df["AgeGroup"] = pd.cut(df['Age'], bins=[-2000,0,11,15,18,30,49,59,200], labels=[-1, 11,15,18,30,49,59,200])
df["AgeGroup_pred"] = pd.cut(df['Age_pred'], bins=[-2000,11,15,18,30,49,59,200], labels=[11,15,18,30,49,59,200])

if 'AgeGroup' not in numerical_columns:
    numerical_columns.append('AgeGroup')
if 'AgeGroup_pred' not in numerical_columns:
    numerical_columns.append('AgeGroup_pred')
    
g = sns.factorplot(y="Survived", x="AgeGroup", data=df, size=4, palette="muted")
g = sns.factorplot(y="Survived", x="AgeGroup_pred", data=df, size=4, palette="muted")



In [263]:
# Frugal_First_Class_Single_Man
# midle age first class single man with large discounted and unknown prefixed ticket and without cabin.
print("died", df[(df['Survived'] == 0) & (df['Sex'] == 'male') 
         & (df['Pclass'] == 1) 
         & (df['Age_pred'] <= 45) 
         & (df['Fare'] > 0)
         & (df['Fare_standard_score_with_Pclass'] < -0.25)
         & (df['TicketPrefix_unknown'] == 1)
         & (df['TicketMembers_Simple'] == 0)
         & (df['CabinCount'] == 0)
        ])
print("survived", df[(df['Survived'] == 1) & (df['Sex'] == 'male') 
         & (df['Pclass'] == 1) 
         & (df['Age_pred'] <= 45) 
         & (df['Fare'] > 0)
         & (df['Fare_standard_score_with_Pclass'] < -0.25)
         & (df['TicketPrefix_unknown'] == 1)
         & (df['TicketMembers_Simple'] == 0)
         & (df['CabinCount'] == 0)
        ])


('died', Empty DataFrame
Columns: [Age, Cabin, Embarked, Fare, Name, Parch, PassengerId, Pclass, Sex, SibSp, Survived, Ticket, SexAdult, Embarked_Category, TicketMembers, Ticket_perishing_women, TicketGroup_include_perishing_women, Ticket_surviving_men, TicketGroup_include_surviving_men, TicketId, TicketMembers_Simple, surname, SurnameMembers, Surname_perishing_women, SurnameGroup_include_perishing_women, Surname_surviving_men, SurnameGroup_include_surviving_men, SurnameId, SurnameMembers_Simple, Name_title, Name_titleCategory, FamilySize, NameLength, CabinLocation, CabinDeck, CabinCount, CabinCategory, Fare_per_ticket_member, Fare_standard_score_with_Pclass, Fare_per_ticket_member_standard_score_with_Pclass, TicketPrefix, Sex_female, Sex_male, Embarked_C, Embarked_Q, Embarked_S, Embarked_unknown, SexAdult_child, SexAdult_female_adult, SexAdult_male_adult, Embarked_Category_0, Embarked_Category_1, Embarked_Category_2, Embarked_Category_3, Name_title_Capt, Name_title_Col, Name_title_Countess, Name_title_Don, Name_title_Dona, Name_title_Dr, Name_title_Jonkheer, Name_title_Lady, Name_title_Major, Name_title_Master, Name_title_Miss, Name_title_Mlle, Name_title_Mme, Name_title_Mr, Name_title_Mrs, Name_title_Rev, Name_title_Sir, Name_titleCategory_1, Name_titleCategory_2, Name_titleCategory_3, Name_titleCategory_4, Name_titleCategory_5, Name_titleCategory_6, Name_titleCategory_7, Name_titleCategory_9, Name_titleCategory_10, CabinLocation_no_cabin, CabinLocation_port, CabinLocation_starboard, CabinLocation_unknown, CabinDeck_A, CabinDeck_B, CabinDeck_C, CabinDeck_D, CabinDeck_E, CabinDeck_F, CabinDeck_G, CabinDeck_T, CabinDeck_no_cabin, CabinCategory_0, CabinCategory_1, CabinCategory_2, CabinCategory_3, CabinCategory_4, CabinCategory_5, CabinCategory_6, ...]
Index: []

[0 rows x 138 columns])
('survived',       Age Cabin Embarked   Fare  \
187  45.0   NaN        S  26.55   
447  34.0   NaN        S  26.55   
507   NaN   NaN        S  26.55   
604  35.0   NaN        C  26.55   

                                              Name  Parch  PassengerId  \
187  Romaine, Mr. Charles Hallace ("Mr C Rolmane")      0          188   
447                    Seward, Mr. Frederic Kimber      0          448   
507  Bradley, Mr. George ("George Arthur Brayton")      0          508   
604                Homer, Mr. Harry ("Mr E Haven")      0          605   

     Pclass   Sex  SibSp  Survived  Ticket    SexAdult  Embarked_Category  \
187       1  male      0       1.0  111428  male_adult                  2   
447       1  male      0       1.0  113794  male_adult                  2   
507       1  male      0       1.0  111427  male_adult                  2   
604       1  male      0       1.0  111426  male_adult                  0   

     TicketMembers  Ticket_perishing_women  \
187              1                     0.0   
447              1                     0.0   
507              1                     0.0   
604              1                     0.0   

     TicketGroup_include_perishing_women  Ticket_surviving_men  \
187                                    0                   0.0   
447                                    0                   0.0   
507                                    0                   0.0   
604                                    0                   0.0   

     TicketGroup_include_surviving_men  TicketId  TicketMembers_Simple  \
187                                  0        -1                     0   
447                                  0        -1                     0   
507                                  0        -1                     0   
604                                  0        -1                     0   

     surname  SurnameMembers  Surname_perishing_women  \
187  romaine               1                      0.0   
447   seward               1                      0.0   
507  bradley               2                      0.0   
604    homer               1                      0.0   

     SurnameGroup_include_perishing_women  Surname_surviving_men  \
187                                     0                    0.0   
447                                     0                    0.0   
507                                     0                    0.0   
604                                     0                    0.0   

     SurnameGroup_include_surviving_men  SurnameId  SurnameMembers_Simple  \
187                                   0         -1                      0   
447                                   0         -1                      0   
507                                   0         -1                      1   
604                                   0         -1                      0   

    Name_title  Name_titleCategory  FamilySize  NameLength CabinLocation  \
187         Mr                   1           0          45      no_cabin   
447         Mr                   1           0          27      no_cabin   
507         Mr                   1           0          45      no_cabin   
604         Mr                   1           0          31      no_cabin   

    CabinDeck  CabinCount  CabinCategory  Fare_per_ticket_member  \
187  no_cabin           0              0                   26.55   
447  no_cabin           0              0                   26.55   
507  no_cabin           0              0                   26.55   
604  no_cabin           0              0                   26.55   

     Fare_standard_score_with_Pclass  \
187                        -0.271769   
447                        -0.271769   
507                        -0.271769   
604                        -0.271769   

     Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix  \
187                                          -0.089502      unknown   
447                                          -0.089502      unknown   
507                                          -0.089502      unknown   
604                                          -0.089502      unknown   

     Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  \
187         0.0       1.0         0.0         0.0         1.0   
447         0.0       1.0         0.0         0.0         1.0   
507         0.0       1.0         0.0         0.0         1.0   
604         0.0       1.0         1.0         0.0         0.0   

     Embarked_unknown  SexAdult_child  SexAdult_female_adult  \
187               0.0             0.0                    0.0   
447               0.0             0.0                    0.0   
507               0.0             0.0                    0.0   
604               0.0             0.0                    0.0   

     SexAdult_male_adult      ...        CabinDeck_E  CabinDeck_F  \
187                  1.0      ...                0.0          0.0   
447                  1.0      ...                0.0          0.0   
507                  1.0      ...                0.0          0.0   
604                  1.0      ...                0.0          0.0   

     CabinDeck_G  CabinDeck_T  CabinDeck_no_cabin  CabinCategory_0  \
187          0.0          0.0                 1.0              1.0   
447          0.0          0.0                 1.0              1.0   
507          0.0          0.0                 1.0              1.0   
604          0.0          0.0                 1.0              1.0   

     CabinCategory_1  CabinCategory_2  CabinCategory_3  CabinCategory_4  \
187              0.0              0.0              0.0              0.0   
447              0.0              0.0              0.0              0.0   
507              0.0              0.0              0.0              0.0   
604              0.0              0.0              0.0              0.0   

     CabinCategory_5  CabinCategory_6  CabinCategory_7  CabinCategory_8  \
187              0.0              0.0              0.0              0.0   
447              0.0              0.0              0.0              0.0   
507              0.0              0.0              0.0              0.0   
604              0.0              0.0              0.0              0.0   

     TicketPrefix_A  TicketPrefix_AQ  TicketPrefix_AS  TicketPrefix_C  \
187             0.0              0.0              0.0             0.0   
447             0.0              0.0              0.0             0.0   
507             0.0              0.0              0.0             0.0   
604             0.0              0.0              0.0             0.0   

     TicketPrefix_CA  TicketPrefix_CASOTON  TicketPrefix_FC  TicketPrefix_FCC  \
187              0.0                   0.0              0.0               0.0   
447              0.0                   0.0              0.0               0.0   
507              0.0                   0.0              0.0               0.0   
604              0.0                   0.0              0.0               0.0   

     TicketPrefix_Fa  TicketPrefix_LINE  TicketPrefix_LP  TicketPrefix_PC  \
187              0.0                0.0              0.0              0.0   
447              0.0                0.0              0.0              0.0   
507              0.0                0.0              0.0              0.0   
604              0.0                0.0              0.0              0.0   

     TicketPrefix_PP  TicketPrefix_PPP  TicketPrefix_SC  TicketPrefix_SCA  \
187              0.0               0.0              0.0               0.0   
447              0.0               0.0              0.0               0.0   
507              0.0               0.0              0.0               0.0   
604              0.0               0.0              0.0               0.0   

     TicketPrefix_SCAH  TicketPrefix_SCAH Basle  TicketPrefix_SCOW  \
187                0.0                      0.0                0.0   
447                0.0                      0.0                0.0   
507                0.0                      0.0                0.0   
604                0.0                      0.0                0.0   

     TicketPrefix_SCPARIS  TicketPrefix_SCParis  TicketPrefix_SOC  \
187                   0.0                   0.0               0.0   
447                   0.0                   0.0               0.0   
507                   0.0                   0.0               0.0   
604                   0.0                   0.0               0.0   

     TicketPrefix_SOP  TicketPrefix_SOPP  TicketPrefix_SOTONO  \
187               0.0                0.0                  0.0   
447               0.0                0.0                  0.0   
507               0.0                0.0                  0.0   
604               0.0                0.0                  0.0   

     TicketPrefix_SOTONOQ  TicketPrefix_SP  TicketPrefix_STONO  \
187                   0.0              0.0                 0.0   
447                   0.0              0.0                 0.0   
507                   0.0              0.0                 0.0   
604                   0.0              0.0                 0.0   

     TicketPrefix_STONOQ  TicketPrefix_SWPP  TicketPrefix_WC  \
187                  0.0                0.0              0.0   
447                  0.0                0.0              0.0   
507                  0.0                0.0              0.0   
604                  0.0                0.0              0.0   

     TicketPrefix_WEP  TicketPrefix_unknown  Age_pred  AgeGroup  AgeGroup_pred  
187               0.0                   1.0      45.0      49.0             49  
447               0.0                   1.0      34.0      49.0             49  
507               0.0                   1.0      45.0       NaN             49  
604               0.0                   1.0      35.0      49.0             49  

[4 rows x 138 columns])

In [264]:
df['Frugal_First_Class_Single_Man'] = 0

df.loc[df[(df['Sex'] == 'male') 
         & (df['CabinCount'] > 0)
         & (df['Embarked'] == 'C')
         & (df['SurnameMembers'] == 1)
         & (df['TicketPrefix_unknown'] == 1.0)
         & (df['Fare_standard_score_with_Pclass'] < -0.23)
         & (df['Pclass'] == 1)]['Frugal_First_Class_Single_Man'].index, 'Frugal_First_Class_Single_Man'] = 1
display(df[(df['Frugal_First_Class_Single_Man'] == 1)])
if 'Frugal_First_Class_Single_Man' not in numerical_columns:
    numerical_columns.append('Frugal_First_Class_Single_Man')


Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult SexAdult_male_adult ... CabinDeck_F CabinDeck_G CabinDeck_T CabinDeck_no_cabin CabinCategory_0 CabinCategory_1 CabinCategory_2 CabinCategory_3 CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man
209 40.0 A31 C 31.0000 Blank, Mr. Henry 0 210 1 male 0 1.0 112277 male_adult 0 1 0.0 0 0.0 0 -1 0 blank 1 0.0 0 0.0 0 -1 0 Mr 1 0 16 starboard A 1 1 31.0000 -0.236466 -0.032824 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 40.00 49.0 49 1
452 30.0 C111 C 27.7500 Foreman, Mr. Benjamin Laventall 0 453 1 male 0 0.0 113051 male_adult 0 1 0.0 0 0.0 0 -1 0 foreman 1 0.0 0 0.0 0 -1 0 Mr 1 0 31 starboard C 1 3 27.7500 -0.261698 -0.073333 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30.00 30.0 30 1
487 58.0 B37 C 29.7000 Kent, Mr. Edward Austin 0 488 1 male 0 0.0 11771 male_adult 0 1 0.0 0 0.0 0 -1 0 kent 1 0.0 0 0.0 0 -1 0 Mr 1 0 23 starboard B 1 2 29.7000 -0.246226 -0.048494 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 58.00 59.0 59 1
632 32.0 B50 C 30.5000 Stahelin-Maeglin, Dr. Max 0 633 1 male 0 1.0 13214 male_adult 0 1 0.0 0 0.0 0 -1 0 stahelin-maeglin 1 0.0 0 0.0 0 -1 0 Dr 5 0 25 port B 1 2 30.5000 -0.240170 -0.038772 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 32.00 49.0 49 1
839 NaN C47 C 29.7000 Marechal, Mr. Pierre 0 840 1 male 0 1.0 11774 male_adult 0 1 0.0 0 0.0 0 -1 0 marechal 1 0.0 0 0.0 0 -1 0 Mr 1 0 20 starboard C 1 3 29.7000 -0.246226 -0.048494 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 42.58 NaN 49 1
889 26.0 C148 C 30.0000 Behr, Mr. Karl Howell 0 890 1 male 0 1.0 111369 male_adult 0 1 0.0 0 0.0 0 -1 0 behr 1 0.0 0 0.0 0 -1 0 Mr 1 0 21 port C 1 3 30.0000 -0.243936 -0.044818 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 26.00 30.0 30 1
959 31.0 C53 C 28.5375 Tucker, Mr. Gilbert Milligan Jr 0 960 1 male 0 NaN 2543 male_adult 0 1 0.0 0 0.0 0 -1 0 tucker 1 0.0 0 0.0 0 -1 0 Mr 1 0 31 starboard C 1 3 28.5375 -0.255323 -0.063098 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 31.00 49.0 49 1
1022 53.0 C51 C 28.5000 Gracie, Col. Archibald IV 0 1023 1 male 0 NaN 113780 male_adult 0 1 0.0 0 0.0 0 -1 0 gracie 1 0.0 0 0.0 0 -1 0 Col 7 0 25 starboard C 1 3 28.5000 -0.255622 -0.063579 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 53.00 59.0 59 1

8 rows × 139 columns


In [265]:
display(df[(df['Sex'] == 'female') & 
   (df['Fare_standard_score_with_Pclass'] <= -0.18) & 
   (df['Age_pred'] > 30) & 
   (df['Pclass'] == 3) & 
   (df['Name_title_Miss'] == 1.0)
  ])

# poor old miss
df['Poor_Old_Miss_Third_Class'] = 0
df.loc[df[(df['Sex'] == 'female') & 
   (df['Fare_standard_score_with_Pclass'] <= -0.18) & 
   (df['Age'] > 30) & 
   (df['Pclass'] == 3) & 
   (df['Name_title_Miss'] == 1.0)].index, 'Poor_Old_Miss_Third_Class'] = 1
       
if 'Poor_Old_Miss_Third_Class' not in numerical_columns:
    numerical_columns.append('Poor_Old_Miss_Third_Class')


Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult SexAdult_male_adult ... CabinDeck_F CabinDeck_G CabinDeck_T CabinDeck_no_cabin CabinCategory_0 CabinCategory_1 CabinCategory_2 CabinCategory_3 CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man
32 NaN NaN Q 7.7500 Glynn, Miss. Mary Agatha 0 33 3 female 0 1.0 335677 female_adult 1 1 0.0 0 0.0 0 -1 0 glynn 1 0.0 0 0.0 0 -1 0 Miss 2 0 24 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 35.000 NaN 49 0
264 NaN NaN Q 7.7500 Henry, Miss. Delia 0 265 3 female 0 0.0 382649 female_adult 1 1 0.0 0 0.0 0 -1 0 henry 1 0.0 0 0.0 0 -1 0 Miss 2 0 18 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30.500 NaN 49 0
276 45.0 NaN S 7.7500 Lindblom, Miss. Augusta Charlotta 0 277 3 female 0 0.0 347073 female_adult 2 1 0.0 0 0.0 0 -1 0 lindblom 1 0.0 0 0.0 0 -1 0 Miss 2 0 33 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 45.000 49.0 49 0
396 31.0 NaN S 7.8542 Olsson, Miss. Elina 0 397 3 female 0 0.0 350407 female_adult 2 1 0.0 0 0.0 0 -1 0 olsson 3 0.0 0 0.0 0 600 1 Miss 2 0 19 no_cabin no_cabin 0 0 7.8542 -0.215323 0.175726 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 31.000 49.0 49 0
573 NaN NaN Q 7.7500 Kelly, Miss. Mary 0 574 3 female 0 1.0 14312 female_adult 1 1 0.0 0 0.0 0 -1 0 kelly 5 0.0 0 0.0 0 406 2 Miss 2 0 17 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30.330 NaN 49 0
727 NaN NaN Q 7.7375 Mannion, Miss. Margareth 0 728 3 female 0 1.0 36866 female_adult 1 1 0.0 0 0.0 0 -1 0 mannion 1 0.0 0 0.0 0 -1 0 Miss 2 0 24 no_cabin no_cabin 0 0 7.7375 -0.221445 0.137628 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30.845 NaN 49 0
767 30.5 NaN Q 7.7500 Mangan, Miss. Mary 0 768 3 female 0 0.0 364850 female_adult 1 1 0.0 0 0.0 0 -1 0 mangan 1 0.0 0 0.0 0 -1 0 Miss 2 0 18 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30.500 49.0 49 0
1097 35.0 NaN Q 7.7500 McGowan, Miss. Katherine 0 1098 3 female 0 NaN 9232 female_adult 1 1 0.0 0 0.0 0 -1 0 mcgowan 2 0.0 0 0.0 0 -1 1 Miss 2 0 24 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 35.000 49.0 49 0
1105 38.0 NaN S 7.7750 Andersson, Miss. Ida Augusta Margareta 2 1106 3 female 4 NaN 347091 female_adult 2 1 0.0 0 0.0 0 -1 0 andersson 11 1.0 1 0.0 0 21 2 Miss 2 6 38 no_cabin no_cabin 0 0 7.7750 -0.219468 0.149933 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 38.000 49.0 49 0
1204 37.0 NaN Q 7.7500 Carr, Miss. Jeannie 0 1205 3 female 0 NaN 368364 female_adult 1 1 0.0 0 0.0 0 -1 0 carr 2 0.0 0 0.0 0 -1 1 Miss 2 0 19 no_cabin no_cabin 0 0 7.7500 -0.220785 0.141736 unknown 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 37.000 49.0 49 0

10 rows × 139 columns


In [266]:
display(df[(df['Sex'] == 'female') & 
   (df['Fare_standard_score_with_Pclass'] <= -0.18) & 
   (df['Age_pred'] >= 38) & 
   (df['Pclass'] == 2) & 
   (df['Name_title_Miss'] == 1.0) &
   (df['TicketPrefix_unknown'] == 1.0) &
   (df['SurnameMembers_Simple'] == 0)
  ])

# poor old miss
df['Poor_Old_Miss_Second_Class'] = 0
df.loc[df[
        (df['Sex'] == 'female') & 
        (df['Fare_standard_score_with_Pclass'] <= -0.18) & 
        (df['Age_pred'] >= 38) & 
        (df['Pclass'] == 2) & 
        (df['Name_title_Miss'] == 1.0) &
        (df['TicketPrefix_unknown'] == 1.0) &
        (df['SurnameMembers_Simple'] == 0)
         ].index, 'Poor_Old_Miss_Second_Class'] = 1
       
if 'Poor_Old_Miss_Second_Class' not in numerical_columns:
    numerical_columns.append('Poor_Old_Miss_Second_Class')


Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult SexAdult_male_adult ... CabinDeck_G CabinDeck_T CabinDeck_no_cabin CabinCategory_0 CabinCategory_1 CabinCategory_2 CabinCategory_3 CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man Poor_Old_Miss_Third_Class
357 38.0 NaN S 13.0 Funk, Miss. Annie Clemmer 0 358 2 female 0 0.0 237671 female_adult 2 1 0.0 0 0.0 0 -1 0 funk 1 0.0 0 0.0 0 -1 0 Miss 2 0 25 no_cabin no_cabin 0 0 13.0 -0.186791 0.134032 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 38.0 49 49 0 0

1 rows × 140 columns


In [267]:
display(df[
    (df['Sex'] == 'female') & 
    (df['Fare_standard_score_with_Pclass'] <= -0.18) & 
    (df['Age_pred'] >= 35) & 
    (df['Pclass'] == 1) & 
    (df['Name_title_Miss'] == 1.0) &
    (df['SurnameMembers_Simple'] == 0)
  ])

# poor old miss
df['Poor_Old_Miss_First_Class'] = 0
df.loc[df[
            (df['Sex'] == 'female') & 
            (df['Fare_standard_score_with_Pclass'] <= -0.18) & 
            (df['Age_pred'] >= 35) & 
            (df['Pclass'] == 1) & 
            (df['Name_title_Miss'] == 1.0) &
            (df['SurnameMembers_Simple'] == 0)
         ].index, 'Poor_Old_Miss_First_Class'] = 1
       
if 'Poor_Old_Miss_First_Class' not in numerical_columns:
    numerical_columns.append('Poor_Old_Miss_First_Class')


Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult SexAdult_male_adult ... CabinDeck_T CabinDeck_no_cabin CabinCategory_0 CabinCategory_1 CabinCategory_2 CabinCategory_3 CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man Poor_Old_Miss_Third_Class Poor_Old_Miss_Second_Class
177 50.0 C49 C 28.7125 Isham, Miss. Ann Elizabeth 0 178 1 female 0 0.0 PC 17595 female_adult 0 1 0.0 0 0.0 0 -1 0 isham 1 0.0 0 0.0 0 -1 0 Miss 2 0 26 starboard C 1 3 28.7125 -0.253930 -0.060862 PC 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 50.0 59 59 0 0 0
1003 36.0 A29 C 31.6792 Evans, Miss. Edith Corse 0 1004 1 female 0 NaN PC 17531 female_adult 0 1 0.0 0 0.0 0 -1 0 evans 1 0.0 0 0.0 0 -1 0 Miss 2 0 24 starboard A 1 1 31.6792 -0.231528 -0.024897 PC 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 36.0 49 49 0 0 0

2 rows × 141 columns


In [268]:
df[(df['Sex'] == 'female') & (df['Fare'] <= 10) & (df['Age'] > 28) & (df['Name_title_Miss'] == 1.0)]

# poor old miss
df['Poor_Old_Miss'] = 0
df.loc[df[(df['Sex'] == 'female') 
         & (df['Fare'] <= 10) 
         & (df['Age_pred'] > 28) 
         & (df['Name_title_Miss'] == 1.0)].index, 'Poor_Old_Miss'] = 1
       
if 'Poor_Old_Miss' not in numerical_columns:
    numerical_columns.append('Poor_Old_Miss')

In [269]:
df[(df['Sex'] == 'female') & (df['Fare'] <= 10) & (df['Age'] > 26) & (df['Embarked'] == 'S') & (df['Name_title_Miss'] == 1.0)]

# poor Shouthampton old miss
df['Poor_Shouthampton_Old_Miss'] = 0
df.loc[df[(df['Sex'] == 'female') 
         & (df['Fare'] <= 10) 
         & (df['Age_pred'] > 26) 
         & (df['Embarked'] == 'S') 
         & (df['Name_title_Miss'] == 1.0)].index, 'Poor_Shouthampton_Old_Miss'] = 1
       
if 'Poor_Shouthampton_Old_Miss' not in numerical_columns:
    numerical_columns.append('Poor_Shouthampton_Old_Miss')

In [270]:
# feature selection
from sklearn.feature_selection import SelectKBest, f_classif

df_copied = df.copy()
df_copied['Name_titleCategory'] = df_copied['Name_titleCategory'].fillna(' ')
df_copied['Cabin'] = df_copied['Cabin'].fillna(' ')
df_copied['Age'] = df_copied['Age'].fillna(-300)
df_copied['AgeGroup'] = df_copied['AgeGroup'].fillna(-1.0)

train = df_copied[0:891].copy()
target = train["Survived"].values

selector = SelectKBest(f_classif, k=len(numerical_columns))
selector.fit(train[numerical_columns], target)
scores = -np.log10(selector.pvalues_)
indices = np.argsort(scores)[::-1]
print("Features importance :")
for f in range(len(scores)):
    print("%0.2f %s" % (scores[indices[f]],numerical_columns[indices[f]]))


Features importance :
nan Name_title_Dona
nan TicketPrefix_LP
nan TicketPrefix_AQ
nan TicketPrefix_STONOQ
70.61 Name_titleCategory_1
70.61 Name_title_Mr
68.85 Sex_female
68.85 Sex_male
64.90 SexAdult_male_adult
53.23 SexAdult_female_adult
25.12 Name_titleCategory_3
24.68 Name_title_Mrs
24.60 Pclass
24.17 Name_titleCategory_2
23.69 NameLength
23.35 Name_title_Miss
21.51 CabinCategory_0
21.51 CabinLocation_no_cabin
21.51 CabinDeck_no_cabin
17.23 CabinCount
17.00 TicketGroup_include_surviving_men
16.30 Ticket_surviving_men
14.21 Fare
13.54 TicketGroup_include_perishing_women
13.22 CabinLocation_starboard
13.04 SurnameGroup_include_surviving_men
12.71 Surname_surviving_men
10.83 Ticket_perishing_women
10.36 SurnameGroup_include_perishing_women
9.04 Surname_perishing_women
6.84 CabinCategory_2
6.84 CabinDeck_B
6.73 CabinLocation_port
6.36 Embarked_Category_0
6.36 Embarked_C
5.52 Embarked_Category_2
5.52 Embarked_S
5.21 CabinCategory_4
5.21 CabinDeck_D
4.98 TicketPrefix_PC
4.88 CabinCategory_5
4.88 CabinDeck_E
3.59 SexAdult_child
3.35 Fare_standard_score_with_Pclass
3.27 TicketPrefix_A
3.22 CabinDeck_C
3.22 CabinCategory_3
1.96 Name_title_Master
1.96 Name_titleCategory_4
1.95 Age
1.83 Parch
1.43 Age_pred
1.35 TicketPrefix_SOTONOQ
1.28 Name_titleCategory_6
1.28 Name_title_Rev
1.28 TicketMembers
1.26 TicketPrefix_FCC
1.25 AgeGroup_pred
1.20 TicketPrefix_WC
1.14 Name_title_Mlle
1.14 TicketPrefix_SWPP
1.14 Embarked_Category_3
1.14 Embarked_unknown
1.08 CabinCategory_6
1.08 CabinDeck_F
1.07 TicketId
0.81 Frugal_First_Class_Single_Man
0.77 Poor_Old_Miss_Third_Class
0.77 TicketPrefix_SOPP
0.72 Poor_Shouthampton_Old_Miss
0.69 TicketPrefix_SCAH Basle
0.69 Name_title_Mme
0.69 Name_title_Lady
0.69 Name_title_Sir
0.69 Name_title_Countess
0.69 TicketPrefix_SC
0.58 TicketPrefix_SOTONO
0.56 TicketPrefix_SOC
0.53 SibSp
0.50 Name_titleCategory_10
0.50 TicketPrefix_PP
0.50 Fare_per_ticket_member_standard_score_with_Pclass
0.37 CabinDeck_T
0.37 Name_title_Capt
0.37 TicketPrefix_CASOTON
0.37 TicketPrefix_SOP
0.37 CabinCategory_8
0.37 TicketPrefix_FC
0.37 TicketPrefix_SP
0.37 Poor_Old_Miss_Second_Class
0.37 Poor_Old_Miss_First_Class
0.37 TicketPrefix_SCA
0.37 TicketPrefix_Fa
0.37 Name_title_Don
0.37 Name_title_Jonkheer
0.37 TicketPrefix_SCOW
0.37 TicketPrefix_AS
0.35 SurnameMembers
0.30 CabinDeck_A
0.30 CabinCategory_1
0.25 TicketPrefix_CA
0.24 TicketPrefix_LINE
0.23 TicketPrefix_STONO
0.21 FamilySize
0.20 TicketPrefix_SCParis
0.20 CabinDeck_G
0.20 CabinLocation_unknown
0.20 CabinCategory_7
0.13 TicketPrefix_PPP
0.13 Name_titleCategory_9
0.13 Name_title_Col
0.13 TicketPrefix_SCAH
0.13 Name_title_Major
0.09 Name_titleCategory_5
0.09 TicketPrefix_SCPARIS
0.09 Name_title_Dr
0.08 AgeGroup
0.08 SurnameId
0.07 TicketPrefix_WEP
0.04 Embarked_Category_1
0.04 Embarked_Q
0.04 Poor_Old_Miss
0.03 TicketPrefix_C
0.03 Name_titleCategory_7
0.02 TicketPrefix_unknown
c:\develop\python27\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [ 51 113 114 115] are constant.
  UserWarning)

In [271]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

random_forest = RandomForestClassifier(n_estimators=3000, min_samples_split=4, class_weight={0:0.745, 1:0.255})
kfold = cross_validation.KFold(train.shape[0], n_folds=3, random_state=42)

scores = cross_validation.cross_val_score(random_forest, train[numerical_columns], target, cv=kfold)
print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean() * 100, scores.std() * 100, 'Random Forest Cross Validation'))

random_forest.fit(train[numerical_columns], target)
score = random_forest.score(train[numerical_columns], target)
print("Accuracy: %0.3f             [%s]" % (score * 100, 'Random Forest full test'))

importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(len(numerical_columns)):
    print("%d. feature %d (%f) %s" % (f + 1, indices[f] + 1, importances[indices[f]] * 100, numerical_columns[indices[f]]))


Accuracy: 90.236 (+/- 1.20) [Random Forest Cross Validation]
Accuracy: 96.633             [Random Forest full test]
1. feature 23 (4.627239) Sex_male
2. feature 19 (4.499238) NameLength
3. feature 24 (4.458889) Sex_female
4. feature 5 (4.326391) Fare
5. feature 117 (4.148197) Age_pred
6. feature 6 (4.127816) Ticket_perishing_women
7. feature 7 (3.989957) TicketGroup_include_perishing_women
8. feature 21 (3.964469) Fare_standard_score_with_Pclass
9. feature 53 (3.858035) Name_titleCategory_1
10. feature 12 (3.774914) Surname_perishing_women
11. feature 36 (3.649514) Name_title_Mr
12. feature 22 (3.557337) Fare_per_ticket_member_standard_score_with_Pclass
13. feature 13 (3.501268) SurnameGroup_include_perishing_women
14. feature 29 (3.438745) SexAdult_male_adult
15. feature 2 (2.909823) Age
16. feature 30 (2.813614) SexAdult_female_adult
17. feature 1 (2.770948) Pclass
18. feature 8 (2.313915) Ticket_surviving_men
19. feature 9 (2.304947) TicketGroup_include_surviving_men
20. feature 11 (2.067989) TicketMembers
21. feature 18 (1.571138) FamilySize
22. feature 119 (1.489239) AgeGroup_pred
23. feature 17 (1.366975) SurnameMembers
24. feature 118 (1.293844) AgeGroup
25. feature 37 (1.279580) Name_title_Mrs
26. feature 54 (1.274140) Name_titleCategory_3
27. feature 14 (1.252589) Surname_surviving_men
28. feature 55 (1.193222) Name_titleCategory_2
29. feature 15 (1.174035) SurnameGroup_include_surviving_men
30. feature 38 (1.093298) Name_title_Miss
31. feature 75 (0.919906) CabinCategory_0
32. feature 62 (0.904816) CabinLocation_no_cabin
33. feature 3 (0.859511) SibSp
34. feature 66 (0.840639) CabinDeck_no_cabin
35. feature 20 (0.837275) CabinCount
36. feature 16 (0.797545) SurnameId
37. feature 10 (0.740958) TicketId
38. feature 63 (0.476063) CabinLocation_starboard
39. feature 4 (0.470066) Parch
40. feature 87 (0.464124) TicketPrefix_unknown
41. feature 103 (0.455635) TicketPrefix_SWPP
42. feature 25 (0.425454) Embarked_S
43. feature 32 (0.407843) Embarked_Category_2
44. feature 31 (0.404095) SexAdult_child
45. feature 26 (0.336420) Embarked_C
46. feature 77 (0.321972) CabinCategory_5
47. feature 125 (0.321428) Poor_Shouthampton_Old_Miss
48. feature 68 (0.310921) CabinDeck_E
49. feature 33 (0.303446) Embarked_Category_0
50. feature 39 (0.283680) Name_title_Master
51. feature 56 (0.282900) Name_titleCategory_4
52. feature 86 (0.281732) TicketPrefix_STONO
53. feature 64 (0.240633) CabinLocation_port
54. feature 85 (0.240431) TicketPrefix_PC
55. feature 124 (0.226805) Poor_Old_Miss
56. feature 34 (0.223911) Embarked_Category_1
57. feature 27 (0.222036) Embarked_Q
58. feature 120 (0.216239) Frugal_First_Class_Single_Man
59. feature 122 (0.172032) Poor_Old_Miss_Second_Class
60. feature 121 (0.162556) Poor_Old_Miss_Third_Class
61. feature 67 (0.160834) CabinDeck_C
62. feature 76 (0.153976) CabinCategory_3
63. feature 72 (0.153036) CabinDeck_B
64. feature 81 (0.152690) CabinCategory_2
65. feature 70 (0.146989) CabinDeck_D
66. feature 97 (0.138446) TicketPrefix_C
67. feature 79 (0.138029) CabinCategory_4
68. feature 123 (0.135876) Poor_Old_Miss_First_Class
69. feature 110 (0.120063) TicketPrefix_SOPP
70. feature 95 (0.115898) TicketPrefix_SOTONOQ
71. feature 58 (0.109808) Name_titleCategory_6
72. feature 41 (0.098383) Name_title_Rev
73. feature 89 (0.097664) TicketPrefix_CA
74. feature 84 (0.088309) TicketPrefix_A
75. feature 94 (0.084395) TicketPrefix_WC
76. feature 60 (0.082761) Name_titleCategory_7
77. feature 98 (0.078718) TicketPrefix_SCPARIS
78. feature 80 (0.077343) CabinCategory_1
79. feature 71 (0.066849) CabinDeck_A
80. feature 59 (0.066597) Name_titleCategory_5
81. feature 42 (0.061349) Name_title_Dr
82. feature 73 (0.045530) CabinDeck_F
83. feature 101 (0.042656) TicketPrefix_LINE
84. feature 44 (0.042127) Name_title_Major
85. feature 82 (0.040519) CabinCategory_6
86. feature 48 (0.039414) Name_title_Col
87. feature 69 (0.031023) CabinDeck_G
88. feature 78 (0.029813) CabinCategory_7
89. feature 102 (0.022288) TicketPrefix_FCC
90. feature 49 (0.019940) Name_title_Capt
91. feature 96 (0.015541) TicketPrefix_WEP
92. feature 57 (0.013597) Name_titleCategory_9
93. feature 93 (0.011283) TicketPrefix_SOC
94. feature 46 (0.008247) Name_title_Sir
95. feature 40 (0.007050) Name_title_Don
96. feature 106 (0.007004) TicketPrefix_SC
97. feature 61 (0.006852) Name_titleCategory_10
98. feature 65 (0.006806) CabinLocation_unknown
99. feature 90 (0.005564) TicketPrefix_SCParis
100. feature 51 (0.004654) Name_title_Jonkheer
101. feature 47 (0.004555) Name_title_Mlle
102. feature 88 (0.004119) TicketPrefix_PP
103. feature 74 (0.002629) CabinDeck_T
104. feature 35 (0.002126) Embarked_Category_3
105. feature 45 (0.001901) Name_title_Lady
106. feature 83 (0.001782) CabinCategory_8
107. feature 111 (0.001704) TicketPrefix_FC
108. feature 28 (0.001523) Embarked_unknown
109. feature 92 (0.001268) TicketPrefix_SP
110. feature 112 (0.001264) TicketPrefix_SOTONO
111. feature 50 (0.001255) Name_title_Countess
112. feature 109 (0.001002) TicketPrefix_SCAH Basle
113. feature 99 (0.000887) TicketPrefix_SOP
114. feature 105 (0.000826) TicketPrefix_PPP
115. feature 107 (0.000650) TicketPrefix_SCAH
116. feature 113 (0.000617) TicketPrefix_CASOTON
117. feature 104 (0.000520) TicketPrefix_SCOW
118. feature 91 (0.000462) TicketPrefix_SCA
119. feature 108 (0.000442) TicketPrefix_AS
120. feature 100 (0.000170) TicketPrefix_Fa
121. feature 114 (0.000000) TicketPrefix_STONOQ
122. feature 115 (0.000000) TicketPrefix_AQ
123. feature 116 (0.000000) TicketPrefix_LP
124. feature 52 (0.000000) Name_title_Dona
125. feature 43 (0.000000) Name_title_Mme

In [272]:
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

ensemble_clfs = [
    ("RandomForestClassifier",
        RandomForestClassifier(
                               n_estimators=3000,
                               n_jobs=8,
                               class_weight={0:0.745, 1:0.255},
                               min_samples_split=4,
                               random_state=42)),
]

kfold = cross_validation.KFold(train.shape[0], n_folds=3, random_state=42)

In [273]:
# error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# for label, clf in ensemble_clfs:
#     print("Classifier : %s" % label)
#     for i in range(2, 10):
#         clf.set_params(min_samples_split=i)
#         clf.fit(train[numerical_columns], target)
#         scores = cross_validation.cross_val_score(clf, train[numerical_columns], target, cv=kfold)
#         error_rate[label].append((i, scores.mean() * 100))
#         print("%d estimator" % i)
        
# for label, clf_err in error_rate.items():
#     xs, ys = zip(*clf_err)
#     plt.plot(xs, ys, label=label)
    
# plt.xlim(2, 10)
# plt.xlabel("min_samples_split")
# plt.ylabel("score")
# plt.legend(loc="upper right")
# plt.show()

In [274]:
# error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# for label, clf in ensemble_clfs:
#     print("Classifier : %s" % label)
#     for i in range(1, 100, 10):
#         clf.set_params(max_depth=i)
#         clf.fit(train[numerical_columns], target)
#         scores = cross_validation.cross_val_score(clf, train[numerical_columns], target, cv=kfold)
# #         error_rate[label].append((i, scores.mean() * 100))
#         print("max_depth : %d" % i)
        
# for label, clf_err in error_rate.items():
#     xs, ys = zip(*clf_err)
#     plt.plot(xs, ys, label=label)
    
# plt.xlim(0, 100)
# plt.xlabel("max_depth")
# plt.ylabel("score")
# plt.legend(loc="upper right")
# plt.show()

In [275]:
features = [
            'Sex_female','Sex_male',
    
            'Age_pred',
    
            'SexAdult_male_adult','SexAdult_female_adult', 'SexAdult_child',
    
            'Name_titleCategory',
#             'Name_titleCategory_1',
#             'Name_titleCategory_2',
#             'Name_titleCategory_3',
#             'Name_titleCategory_4',
#             'Name_titleCategory_5',
#             'Name_titleCategory_6',
#             'Name_titleCategory_7',
#             'Name_titleCategory_9',
#             'Name_titleCategory_10',
#             'Name_title_Mr', 'Name_title_Mrs', 'Name_title_Miss', 'Name_title_Master', 
#             'Name_title_Don', 'Name_title_Rev', 'Name_title_Dr', 'Name_title_Mme', 
#             'Name_title_Major', 'Name_title_Lady', 'Name_title_Sir', 'Name_title_Mlle', 'Name_title_Col', 
#             'Name_title_Capt', 'Name_title_Countess', 'Name_title_Jonkheer', 

            'Pclass', 
            
            'TicketId',
    
            'NameLength',

            'CabinLocation_no_cabin', 'CabinLocation_starboard', 'CabinLocation_port', 
            'CabinCategory',
#             'CabinCategory_0',
#             'CabinCategory_1',
#             'CabinCategory_2',
#             'CabinCategory_3',
#             'CabinCategory_4',
#             'CabinCategory_5',
#             'CabinCategory_6',
#             'CabinCategory_7',
#             'CabinCategory_8',
#             'CabinDeck_C', 'CabinDeck_E', 'CabinDeck_G', 'CabinDeck_D', 'CabinDeck_A', 'CabinDeck_B', 'CabinDeck_F', 'CabinDeck_T','CabinDeck_no_cabin', 

            'SibSp','Parch',
    
            'Fare',
#             'Fare_per_ticket_member',
#             'Fare_standard_score_with_Pclass',
#             'Fare_per_ticket_member_standard_score_with_Pclass',
    
            'Embarked_Category',
#             'Embarked_S','Embarked_Q','Embarked_C','Embarked_unknown',
    
            'SurnameMembers_Simple','SurnameGroup_include_perishing_women','SurnameGroup_include_surviving_men',
    
            'TicketMembers_Simple', 'TicketGroup_include_perishing_women','TicketGroup_include_surviving_men',
    
            'FamilySize', 

#             'Frugal_First_Class_Single_Man',
#             'Poor_Old_Miss',
#             'Poor_Shouthampton_Old_Miss',
#             'Poor_Old_Miss_Third_Class',
#             'Poor_Old_Miss_Second_Class',
#             'Poor_Old_Miss_First_Class',
    
#             'TicketPrefix_SOPP', 'TicketPrefix_WC',
#             'TicketPrefix_unknown', 
#             'TicketPrefix_SCA','TicketPrefix_SP','TicketPrefix_SOP','TicketPrefix_Fa','TicketPrefix_SCOW','TicketPrefix_AS',
#             'TicketPrefix_FC','TicketPrefix_SOTONO','TicketPrefix_CASOTON','TicketPrefix_SWPP','TicketPrefix_SC','TicketPrefix_SCAH Basle',
    
#             'CabinCount',
           ]

In [276]:
# analyze failed.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, target, test_size=0.2, random_state=42)
random_forest = RandomForestClassifier(n_estimators=3000, min_samples_split=4, class_weight={0:0.745, 1:0.255})
kfold = cross_validation.KFold(X_train.shape[0], n_folds=3, random_state=42)

scores = cross_validation.cross_val_score(random_forest, X_train[features], y_train, cv=kfold)
print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean() * 100, scores.std() * 100, 'Random Forest Cross Validation'))

random_forest.fit(X_train[features], y_train)
score = random_forest.score(X_test[features], y_test)
print("Accuracy: %0.3f             [%s]" % (score * 100, 'Random Forest full test'))
pred_test = random_forest.predict(X_test[features])

importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(len(features)):
    print("%d. feature %d (%f) %s" % (f + 1, indices[f] + 1, importances[indices[f]] * 100, features[indices[f]]))


Accuracy: 89.187 (+/- 1.19) [Random Forest Cross Validation]
Accuracy: 91.620             [Random Forest full test]
1. feature 17 (9.272662) Fare
2. feature 3 (8.709954) Age_pred
3. feature 10 (7.882402) NameLength
4. feature 23 (7.649029) TicketGroup_include_perishing_women
5. feature 7 (7.161242) Name_titleCategory
6. feature 20 (7.011301) SurnameGroup_include_perishing_women
7. feature 2 (6.655039) Sex_male
8. feature 8 (6.208368) Pclass
9. feature 1 (5.798096) Sex_female
10. feature 4 (4.838878) SexAdult_male_adult
11. feature 24 (4.692014) TicketGroup_include_surviving_men
12. feature 5 (3.986412) SexAdult_female_adult
13. feature 22 (3.011894) TicketMembers_Simple
14. feature 25 (2.667078) FamilySize
15. feature 21 (2.497202) SurnameGroup_include_surviving_men
16. feature 14 (2.357309) CabinCategory
17. feature 19 (1.933670) SurnameMembers_Simple
18. feature 18 (1.596739) Embarked_Category
19. feature 9 (1.193332) TicketId
20. feature 11 (1.130423) CabinLocation_no_cabin
21. feature 15 (1.121023) SibSp
22. feature 12 (0.764313) CabinLocation_starboard
23. feature 6 (0.743239) SexAdult_child
24. feature 16 (0.717791) Parch
25. feature 13 (0.400588) CabinLocation_port

In [277]:
pd.set_option("display.max_columns",101)
X_test_reseted = X_test.reset_index()
X_test_reseted['Survived_'] = y_test
X_test_reseted['Prediction'] = pred_test
X_test_reseted['pred_result'] = pred_test == y_test

In [278]:
display(X_test_reseted[(X_test_reseted['Survived'] == 1.0) & (X_test_reseted['pred_result'] == False)])


index Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult ... CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man Poor_Old_Miss_Third_Class Poor_Old_Miss_Second_Class Poor_Old_Miss_First_Class Poor_Old_Miss Poor_Shouthampton_Old_Miss Survived_ Prediction pred_result
21 447 34.0 S 26.5500 Seward, Mr. Frederic Kimber 0 448 1 male 0 1.0 113794 male_adult 2 1 0.0 0 0.0 0 -1 0 seward 1 0.0 0 0.0 0 -1 0 Mr 1 0 27 no_cabin no_cabin 0 0 26.5500 -0.271769 -0.089502 unknown 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 34.0 49 49 0 0 0 0 0 0 1.0 0.0 False
27 673 31.0 S 13.0000 Wilhelms, Mr. Charles 0 674 2 male 0 1.0 244270 male_adult 2 1 0.0 0 0.0 0 -1 0 wilhelms 1 0.0 0 0.0 0 -1 0 Mr 1 0 21 no_cabin no_cabin 0 0 13.0000 -0.186791 0.134032 unknown 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 31.0 49 49 0 0 0 0 0 0 1.0 0.0 False
34 204 18.0 S 8.0500 Cohen, Mr. Gurshon "Gus" 0 205 3 male 0 1.0 A/5 3540 male_adult 2 1 0.0 0 0.0 0 -1 0 cohen 1 0.0 0 0.0 0 -1 0 Mr 1 0 24 no_cabin no_cabin 0 0 8.0500 -0.205253 0.238394 A 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18.0 18 18 0 0 0 0 0 0 1.0 0.0 False
36 23 28.0 A6 S 35.5000 Sloper, Mr. William Thompson 0 24 1 male 0 1.0 113788 male_adult 2 1 0.0 0 0.0 0 -1 0 sloper 1 0.0 0 0.0 0 -1 0 Mr 1 0 28 port A 1 1 35.5000 -0.205583 0.016756 unknown 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 28.0 30 30 0 0 0 0 0 0 1.0 0.0 False
72 889 26.0 C148 C 30.0000 Behr, Mr. Karl Howell 0 890 1 male 0 1.0 111369 male_adult 0 1 0.0 0 0.0 0 -1 0 behr 1 0.0 0 0.0 0 -1 0 Mr 1 0 21 port C 1 3 30.0000 -0.243936 -0.044818 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 26.0 30 30 1 0 0 0 0 0 1.0 0.0 False
97 338 45.0 S 8.0500 Dahl, Mr. Karl Edwart 0 339 3 male 0 1.0 7598 male_adult 2 1 0.0 0 0.0 0 -1 0 dahl 1 0.0 0 0.0 0 -1 0 Mr 1 0 21 no_cabin no_cabin 0 0 8.0500 -0.205253 0.238394 unknown 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 45.0 49 49 0 0 0 0 0 0 1.0 0.0 False
108 286 30.0 S 9.5000 de Mulder, Mr. Theodore 0 287 3 male 0 1.0 345774 male_adult 2 1 0.0 0 0.0 0 -1 0 de mulder 1 0.0 0 0.0 0 -1 0 Mr 1 0 23 no_cabin no_cabin 0 0 9.5000 -0.137514 0.659950 unknown 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30.0 30 30 0 0 0 0 0 0 1.0 0.0 False
109 209 40.0 A31 C 31.0000 Blank, Mr. Henry 0 210 1 male 0 1.0 112277 male_adult 0 1 0.0 0 0.0 0 -1 0 blank 1 0.0 0 0.0 0 -1 0 Mr 1 0 16 starboard A 1 1 31.0000 -0.236466 -0.032824 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 40.0 49 49 1 0 0 0 0 0 1.0 0.0 False
113 512 36.0 E25 S 26.2875 McGough, Mr. James Robert 0 513 1 male 0 1.0 PC 17473 male_adult 2 1 0.0 0 0.0 0 -1 0 mcgough 1 0.0 0 0.0 0 -1 0 Mr 1 0 25 starboard E 1 5 26.2875 -0.274033 -0.093136 PC 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 36.0 49 49 0 0 0 0 0 0 1.0 0.0 False
125 604 35.0 C 26.5500 Homer, Mr. Harry ("Mr E Haven") 0 605 1 male 0 1.0 111426 male_adult 0 1 0.0 0 0.0 0 -1 0 homer 1 0.0 0 0.0 0 -1 0 Mr 1 0 31 no_cabin no_cabin 0 0 26.5500 -0.271769 -0.089502 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 35.0 49 49 0 0 0 0 0 0 1.0 0.0 False
161 507 -300.0 S 26.5500 Bradley, Mr. George ("George Arthur Brayton") 0 508 1 male 0 1.0 111427 male_adult 2 1 0.0 0 0.0 0 -1 0 bradley 2 0.0 0 0.0 0 -1 1 Mr 1 0 45 no_cabin no_cabin 0 0 26.5500 -0.271769 -0.089502 unknown 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 45.0 -1 49 0 0 0 0 0 0 1.0 0.0 False
172 572 36.0 E25 S 26.3875 Flynn, Mr. John Irwin ("Irving") 0 573 1 male 0 1.0 PC 17474 male_adult 2 1 0.0 0 0.0 0 -1 0 flynn 3 0.0 0 0.0 0 257 1 Mr 1 0 32 starboard E 1 5 26.3875 -0.273168 -0.091747 PC 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 36.0 49 49 0 0 0 0 0 0 1.0 0.0 False

12 rows × 148 columns


In [279]:
display(X_test_reseted[(X_test_reseted['Survived'] == 0.0) & (X_test_reseted['pred_result'] == False)])


index Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult ... CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man Poor_Old_Miss_Third_Class Poor_Old_Miss_Second_Class Poor_Old_Miss_First_Class Poor_Old_Miss Poor_Shouthampton_Old_Miss Survived_ Prediction pred_result
33 235 -300.0 S 7.55 Harknett, Miss. Alice Phoebe 0 236 3 female 0 0.0 W./C. 6609 female_adult 2 1 0.0 0 0.0 0 -1 0 harknett 1 0.0 0 0.0 0 -1 0 Miss 2 0 28 no_cabin no_cabin 0 0 7.55 -0.231477 0.075199 WC 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 19.31 -1 30 0 0 0 0 0 0 0.0 1.0 False
65 772 57.0 E77 S 10.50 Mack, Mrs. (Mary) 0 773 2 female 0 0.0 S.O./P.P. 3 female_adult 2 2 0.0 0 0.0 0 -1 1 mack 1 0.0 0 0.0 0 -1 0 Mrs 3 0 17 starboard E 1 5 5.25 -0.268509 -0.797750 SOPP 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 57.00 59 59 0 0 0 0 0 0 0.0 1.0 False
78 357 38.0 S 13.00 Funk, Miss. Annie Clemmer 0 358 2 female 0 0.0 237671 female_adult 2 1 0.0 0 0.0 0 -1 0 funk 1 0.0 0 0.0 0 -1 0 Miss 2 0 25 no_cabin no_cabin 0 0 13.00 -0.186791 0.134032 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 38.00 49 49 0 0 1 0 0 0 0.0 1.0 False

3 rows × 148 columns


In [280]:
# select specidic features
random_forest = RandomForestClassifier(n_estimators=3000, min_samples_split=4, class_weight={0:0.745, 1:0.255})
kfold = cross_validation.KFold(train.shape[0], n_folds=3, random_state=42)

scores = cross_validation.cross_val_score(random_forest, train[features], target, cv=kfold)
print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean() * 100, scores.std() * 100, 'Random Forest Cross Validation'))

random_forest.fit(train[features], target)
score = random_forest.score(train[features], target)
print("Accuracy: %0.3f             [%s]" % (score * 100, 'Random Forest full test'))

importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(len(features)):
    print("%d. feature %d (%f) %s" % (f + 1, indices[f] + 1, importances[indices[f]] * 100, features[indices[f]]))


Accuracy: 89.562 (+/- 1.10) [Random Forest Cross Validation]
Accuracy: 96.296             [Random Forest full test]
1. feature 17 (9.195192) Fare
2. feature 3 (8.895260) Age_pred
3. feature 23 (8.440197) TicketGroup_include_perishing_women
4. feature 10 (8.294630) NameLength
5. feature 20 (7.377621) SurnameGroup_include_perishing_women
6. feature 2 (6.882887) Sex_male
7. feature 7 (6.858682) Name_titleCategory
8. feature 1 (6.099656) Sex_female
9. feature 8 (5.239385) Pclass
10. feature 4 (4.682957) SexAdult_male_adult
11. feature 24 (4.608597) TicketGroup_include_surviving_men
12. feature 5 (3.293446) SexAdult_female_adult
13. feature 22 (2.915148) TicketMembers_Simple
14. feature 25 (2.634912) FamilySize
15. feature 14 (2.558125) CabinCategory
16. feature 21 (2.510422) SurnameGroup_include_surviving_men
17. feature 19 (1.855995) SurnameMembers_Simple
18. feature 18 (1.606718) Embarked_Category
19. feature 11 (1.464326) CabinLocation_no_cabin
20. feature 9 (1.163512) TicketId
21. feature 15 (1.143969) SibSp
22. feature 6 (0.658699) SexAdult_child
23. feature 12 (0.615585) CabinLocation_starboard
24. feature 16 (0.602184) Parch
25. feature 13 (0.401896) CabinLocation_port

In [281]:
random_forest = RandomForestClassifier(n_estimators=3000, min_samples_split=4, class_weight={0:0.745, 1:0.255})
test = df_copied[891:].copy()
random_forest.fit(train[features], target)
predictions = random_forest.predict(test[features])

In [282]:
PassengerId = np.array(test["PassengerId"]).astype(int)
submit_df = pd.DataFrame(predictions, PassengerId, columns = ['Survived']).astype(int)
submit_df.to_csv('titanic.csv', index_label=['PassengerId'])

In [283]:
wnot_subimt_df = pd.read_csv("wnot_submit.csv")
wnot_subimt_df = wnot_subimt_df.reset_index().drop('index', axis=1)
wnot_subimt_df = wnot_subimt_df.set_index('PassengerId')
diff = submit_df.copy()
diff['Survived_wnot'] = wnot_subimt_df['Survived']
diff['pred_result'] = diff['Survived_wnot'] == diff['Survived']

In [284]:
display(df_copied.loc[diff[(diff['pred_result'] == False)].index - 1, :])
print(diff[(diff['pred_result'] == False)])


Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket SexAdult Embarked_Category TicketMembers Ticket_perishing_women TicketGroup_include_perishing_women Ticket_surviving_men TicketGroup_include_surviving_men TicketId TicketMembers_Simple surname SurnameMembers Surname_perishing_women SurnameGroup_include_perishing_women Surname_surviving_men SurnameGroup_include_surviving_men SurnameId SurnameMembers_Simple Name_title Name_titleCategory FamilySize NameLength CabinLocation CabinDeck CabinCount CabinCategory Fare_per_ticket_member Fare_standard_score_with_Pclass Fare_per_ticket_member_standard_score_with_Pclass TicketPrefix Sex_female Sex_male Embarked_C Embarked_Q Embarked_S Embarked_unknown SexAdult_child SexAdult_female_adult SexAdult_male_adult ... CabinCategory_1 CabinCategory_2 CabinCategory_3 CabinCategory_4 CabinCategory_5 CabinCategory_6 CabinCategory_7 CabinCategory_8 TicketPrefix_A TicketPrefix_AQ TicketPrefix_AS TicketPrefix_C TicketPrefix_CA TicketPrefix_CASOTON TicketPrefix_FC TicketPrefix_FCC TicketPrefix_Fa TicketPrefix_LINE TicketPrefix_LP TicketPrefix_PC TicketPrefix_PP TicketPrefix_PPP TicketPrefix_SC TicketPrefix_SCA TicketPrefix_SCAH TicketPrefix_SCAH Basle TicketPrefix_SCOW TicketPrefix_SCPARIS TicketPrefix_SCParis TicketPrefix_SOC TicketPrefix_SOP TicketPrefix_SOPP TicketPrefix_SOTONO TicketPrefix_SOTONOQ TicketPrefix_SP TicketPrefix_STONO TicketPrefix_STONOQ TicketPrefix_SWPP TicketPrefix_WC TicketPrefix_WEP TicketPrefix_unknown Age_pred AgeGroup AgeGroup_pred Frugal_First_Class_Single_Man Poor_Old_Miss_Third_Class Poor_Old_Miss_Second_Class Poor_Old_Miss_First_Class Poor_Old_Miss Poor_Shouthampton_Old_Miss
1088 18.0 S 7.7750 Nilsson, Miss. Berta Olivia 0 1089 3 female 0 NaN 347066 female_adult 2 1 0.0 0 0.0 0 -1 0 nilsson 3 0.0 0 0.0 0 577 1 Miss 2 0 27 no_cabin no_cabin 0 0 7.775000 -0.219468 0.149933 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 18.000000 18 18 0 0 0 0 0 0
1258 22.0 S 39.6875 Riihivouri, Miss. Susanna Juhantytar Sanni"" 0 1259 3 female 0 NaN 3101295 female_adult 2 7 1.0 1 0.0 0 335 2 riihivouri 1 0.0 0 0.0 0 -1 0 Miss 2 0 44 no_cabin no_cabin 0 0 5.669643 0.447599 -0.653553 unknown 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 22.000000 30 30 0 0 0 0 0 0
1308 -300.0 C 22.3583 Peter, Master. Michael J 1 1309 3 male 1 NaN 2668 male_adult 0 3 0.0 0 0.0 0 262 1 peter 3 0.0 0 0.0 0 639 1 Master 4 2 24 no_cabin no_cabin 0 0 7.452767 0.212708 0.042212 unknown 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 6.888267 -1 11 0 0 0 0 0 0

3 rows × 144 columns

      Survived  Survived_wnot pred_result
1089         1              0       False
1259         1              0       False
1309         0              1       False

In [ ]: