In [45]:
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib notebook
# Load the training and test CSVs, using PassengerId as the row index.
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
# Number of training rows; the combined frame is split back at this offset later.
tr_len = len(train)
# Stack the training features (target removed) on top of the test rows so that
# feature engineering is applied to both at once. pd.concat replaces
# DataFrame.append, which no longer exists in pandas >= 2.0.
df = pd.concat([train.drop('Survived', axis=1), test])
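We'll extract each passenger's title (the text between the comma and the period) from the Name
feature, and then merge equivalent or rare titles into shared groups: French forms and "Ms" into Mrs/Miss, nobility into fNoble/mNoble, and military ranks into mil.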
In [46]:
# Extract the title from each name: the text between the first ", " and the
# following ".". The pattern is a raw string so the regex escape \s is not
# interpreted as an (invalid) Python string escape.
df['Title'] = df['Name'].str.extract(r',\s(.*?)[.]', expand=False)

# Map equivalent or rare titles onto shared categories. Keys are compared
# against whole cell values, so 'Don' does not affect 'Dona'.
title_groups = {
    # alternative spellings of the common female titles
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Miss',
    # female nobility
    'Lady': 'fNoble',
    'the Countess': 'fNoble',
    'Dona': 'fNoble',
    # male nobility
    'Don': 'mNoble',
    'Sir': 'mNoble',
    'Jonkheer': 'mNoble',
    # military ranks
    'Col': 'mil',
    'Capt': 'mil',
    'Major': 'mil',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not update df when pandas Copy-on-Write is enabled.
df['Title'] = df['Title'].replace(title_groups)
In [47]:
# Family size = siblings/spouses + parents/children + the passenger themself.
df['FamSize'] = df['SibSp'].add(df['Parch']).add(1)
In [48]:
# Number of rows (train and test combined) that share each ticket value.
df['TicketSize'] = df['Ticket'].map(df['Ticket'].value_counts())
# Fare divided by the number of passengers on the same ticket.
df['AdjFare'] = df['Fare'].div(df['TicketSize'])
# Fill missing per-passenger fares with the median of the passenger's Pclass.
# transform() returns values aligned to df's own index, whereas
# groupby(...).apply(...) prepends the group key to the index on pandas >= 2.0
# and can then no longer be assigned back to the column.
df['AdjFare'] = df['AdjFare'].fillna(df.groupby('Pclass')['AdjFare'].transform('median'))
In [49]:
# Fill missing ages with the median age of passengers sharing the same Sex and
# Title. transform() keeps the result aligned to df's index; the equivalent
# groupby(...).apply(...) returns a group-keyed MultiIndex on pandas >= 2.0 and
# cannot be assigned back. A (Sex, Title) group with no known ages stays NaN.
df['FilledAge'] = df['Age'].fillna(df.groupby(['Sex', 'Title'])['Age'].transform('median'))
In [50]:
# Fill missing embarkation ports with 'S'. The result is assigned back because
# fillna(..., inplace=True) on a column selection acts on an intermediate
# object and leaves df unchanged when pandas Copy-on-Write is enabled.
# NOTE(review): 'S' is presumably the most frequent port - confirm against the data.
df['Embarked'] = df['Embarked'].fillna('S')
In [52]:
# Binary flag: 1 when a Cabin value is recorded for the passenger, 0 when it is missing.
has_cabin = df['Cabin'].notna()
df['CabinKnown'] = has_cabin.astype(int)
In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Work on a copy so the engineered frame `df` keeps its string-valued columns.
pdf = df.copy()

# Integer-encode the categorical columns used as model features. Embarked is
# not encoded because it is dropped from the feature set just below.
le = LabelEncoder()
pdf['Sex'] = le.fit_transform(pdf['Sex'])
pdf['Title'] = le.fit_transform(pdf['Title'])
# CabinKnown and Embarked are excluded from the feature set.
pdf = pdf.drop(['CabinKnown', 'Embarked'], axis=1)

# Rows past the first tr_len are the ones that came from test.csv.
# NOTE(review): p_test still carries the raw columns (Name, Age, ...) that are
# removed from p_train below; drop them as well before predicting with clf.
p_test = pdf[tr_len:]
# Training rows: re-attach the target and remove the raw columns.
p_train = (
    pdf[:tr_len]
    .join(train[['Survived']])
    .drop(['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin'], axis=1)
)

# Hold out part of the labelled data for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    p_train.drop('Survived', axis=1),
    p_train['Survived'],
    random_state=236,
)

# Seed the forest too, so the score and the feature importances are the same
# on every Restart & Run All; without random_state each fit differs.
clf = RandomForestClassifier(n_estimators=1000, max_depth=7, max_features=4, random_state=236)
clf.fit(X_train, y_train)
# Accuracy on the single hold-out split above (the label says "CV", but no
# cross-validation is performed here).
print('CV Score: {}'.format(clf.score(X_test, y_test)))
# Feature importances labelled by column name; displayed as the cell output.
pd.Series(clf.feature_importances_, index=X_train.columns)
Out[77]:
In [78]:
# Print df's column summary (dtypes and non-null counts) after feature engineering.
df.info()