In [1]:
# https://www.kaggle.com/omarelgabry/a-journey-through-titanic
In [2]:
# Imports
# pandas
import pandas as pd
from pandas import Series, DataFrame
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
In [3]:
# Load the training and test splits (in that order) from the local
# input/ directory, one DataFrame per CSV file.
titanic_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
)
# Leave the first rows of the training frame as the cell's displayed output.
titanic_df.head()
Out[3]:
In [4]:
# Print the column/dtype/non-null summary of the training frame, then of the
# test frame, with a dashed rule between the two reports.
for report_index, frame in enumerate((titanic_df, test_df)):
    if report_index > 0:
        print("--------------------------")
    frame.info()
In [5]:
# Discard the columns that play no part in the analysis or the prediction.
# PassengerId is removed from the training frame only; the test frame keeps
# it (presumably for building the submission later -- confirm downstream).
titanic_df = titanic_df.drop(labels=['PassengerId', 'Name', 'Ticket'],
                             axis='columns')
test_df = test_df.drop(labels=['Name', 'Ticket'], axis='columns')
In [6]:
# Embarked

# Only titanic_df has missing Embarked values (two of them); fill them
# with the most frequent value, which is 'S'.
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# Plot: survival rate per port of embarkation (figure-level point plot).
# NOTE: `factorplot` and `size=` are the pre-0.9 seaborn spellings used by this
# notebook; on newer seaborn the equivalent call is
# sns.catplot(x=..., y=..., data=..., kind='point', height=4, aspect=3).
sns.factorplot(x='Embarked', y='Survived', data=titanic_df, size=4, aspect=3)

# One row of three panels:
#   axis1 - passenger count per port
#   axis2 - survived / not-survived counts, split by port
#   axis3 - mean survival per port
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

# Axes-level countplot draws directly onto the subplot passed via ax=.
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue='Embarked', data=titanic_df, order=[1, 0],
              ax=axis2)

# Group by Embarked and take the mean of Survived for each value.
# as_index=False keeps 'Embarked' as a regular column so that the barplot
# below can reference it through x='Embarked'.
embark_perc = (
    titanic_df[["Embarked", "Survived"]]
    .groupby(['Embarked'], as_index=False)
    .mean()
)
sns.barplot(x='Embarked', y='Survived', data=embark_perc,
            order=['S', 'C', 'Q'], ax=axis3)

# Either consider the Embarked column in predictions,
# remove the "S" dummy variable,
# and leave "C" & "Q", since they seem to have a good rate for Survival.
# OR, don't create dummy variables for the Embarked column, just drop it,
# because logically, Embarked doesn't seem to be useful in prediction.
In [ ]: