First class of Titanic

작성자: 박주홍

문제 정의

  • First Class 는 누가 타나요?
    • 타이타닉 데이터를 Decision Tree 에 적용
    • First Class 탑승자 Node 찾기
    • 위 Node 까지 도달하기 위한 Major Feature 찾기

Setting


In [1]:
import folium
import graphviz
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Cap DataFrame displays at 10 rows; longer frames are shown truncated.
pd.set_option('display.max_rows', 10)
# Draw matplotlib figures inline, in the output area of the cell that creates them.
%matplotlib inline

Read Data


In [5]:
# Load the "titanic" dataset through seaborn and relabel every row index
# value i as the string "p_<i>" so passengers have readable identifiers.
titanic = sns.load_dataset('titanic').rename(index=lambda i: f'p_{i}')
titanic


Out[5]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
p_0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
p_1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
p_2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
p_3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
p_4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
p_886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
p_887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
p_888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
p_889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
p_890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

891 rows × 15 columns

Preprocessing


In [6]:
# Restrict the data to the four columns used in this analysis.
titanic_class = titanic.loc[:, ["sex", "embark_town", "age", "class"]]

In [7]:
# Count the rows whose embarkation town is missing.  The vectorised sum is
# converted to a plain int so the cell displays a bare number.
int(titanic_class["embark_town"].isna().sum())


Out[7]:
2

In [8]:
# Keep only the rows where both the embarkation town and the age are present.
titanic_class = titanic_class.dropna(subset=["embark_town", "age"])

In [9]:
# Row counts per sex, with one bar per ticket class.
sns.countplot(data=titanic_class, x="sex", hue="class")


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1d7b1f28>

In [10]:
# Row counts per embarkation town, with one bar per ticket class.
sns.countplot(data=titanic_class, x="embark_town", hue="class")


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x110acc048>

In [11]:
# Box plot of age for each ticket class.
sns.catplot(data=titanic_class, x="class", y="age", kind="box")


Out[11]:
<seaborn.axisgrid.FacetGrid at 0x110aa78d0>

In [12]:
# One-hot encode the two categorical predictors ("sex", "embark_town") and
# leave "age" and "class" as they are.  With `columns=...`, get_dummies puts
# the untouched columns first and the indicator columns after them, which is
# the same layout the previous concat of ["age", "class"] + dummies produced.
#
# The indicator dtype is pinned explicitly: without it the result depends on
# the installed pandas (boolean indicators by default from pandas 2.0 on),
# and the table would no longer show the 0/1 values this notebook expects.
titanic_pre = pd.get_dummies(
    titanic_class,
    columns=["sex", "embark_town"],
    dtype=np.uint8,
)
titanic_pre


Out[12]:
age class sex_female sex_male embark_town_Cherbourg embark_town_Queenstown embark_town_Southampton
p_0 22.0 Third 0 1 0 0 1
p_1 38.0 First 1 0 1 0 0
p_2 26.0 Third 1 0 0 0 1
p_3 35.0 First 1 0 0 0 1
p_4 35.0 Third 0 1 0 0 1
... ... ... ... ... ... ... ...
p_885 39.0 Third 1 0 0 1 0
p_886 27.0 Second 0 1 0 0 1
p_887 19.0 First 1 0 0 0 1
p_889 26.0 First 0 1 1 0 0
p_890 32.0 Third 0 1 0 1 0

712 rows × 7 columns

Modeling


In [13]:
# Predictors: every column except "class".
X = titanic_pre.drop(columns="class")
# Target: the "class" column, kept as a one-column DataFrame.
y = titanic_pre.loc[:, ["class"]]

In [14]:
# Fit a depth-limited decision tree (fixed seed for repeatable splits), then
# print the feature names followed by the importance the tree assigned to each.
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=3).fit(X, y)
print(X.columns.values, tree_clf.feature_importances_, sep="\n")


['age' 'sex_female' 'sex_male' 'embark_town_Cherbourg'
 'embark_town_Queenstown' 'embark_town_Southampton']
[0.66668294 0.         0.         0.33331706 0.         0.        ]

In [15]:
# Pair each feature name with its importance: one row per feature,
# name in the first column and importance in the second.
print(np.column_stack((X.columns.values, tree_clf.feature_importances_)))


[['age' 0.6666829382620311]
 ['sex_female' 0.0]
 ['sex_male' 0.0]
 ['embark_town_Cherbourg' 0.333317061737969]
 ['embark_town_Queenstown' 0.0]
 ['embark_town_Southampton' 0.0]]

In [16]:
# Write the fitted tree to "tree.dot" in Graphviz DOT format, with nodes
# labelled by feature / class name, drawn with rounded corners and filled
# with a colour.
#
# Feature and class names are passed as plain Python lists rather than a
# pandas Index / NumPy array.  NOTE(review): some scikit-learn releases are
# believed to validate these arguments strictly as lists; confirm against the
# versions in use.  Lists are accepted everywhere and leave the exported
# graph unchanged.
export_graphviz(
    tree_clf,
    out_file="tree.dot",
    feature_names=X.columns.tolist(),
    class_names=tree_clf.classes_.tolist(),
    rounded=True,
    filled=True,
)

In [17]:
# Read back the DOT description written to "tree.dot".
with open("tree.dot") as dot_file:
    dot_graph = dot_file.read()

# Wrap it in a graphviz Source that renders to PNG, save the image under
# images/decision_trees/ (cleanup=True removes the intermediate source file),
# and finish with the object itself so the notebook displays the tree.
dot = graphviz.Source(dot_graph, format='png')
dot.render(filename='tree', directory='images/decision_trees', cleanup=True)
dot


Out[17]:
Tree 0 age <= 34.75 gini = 0.626 samples = 712 value = [184, 173, 355] class = Third 1 embark_town_Cherbourg <= 0.5 gini = 0.558 samples = 479 value = [73, 120, 286] class = Third 0->1 True 8 embark_town_Cherbourg <= 0.5 gini = 0.634 samples = 233 value = [111, 53, 69] class = First 0->8 False 2 age <= 22.5 gini = 0.524 samples = 397 value = [42, 106, 249] class = Third 1->2 5 age <= 15.5 gini = 0.624 samples = 82 value = [31, 14, 37] class = Third 1->5 3 gini = 0.444 samples = 193 value = [19, 36, 138] class = Third 2->3 4 gini = 0.573 samples = 204 value = [23, 70, 111] class = Third 2->4 6 gini = 0.278 samples = 18 value = [0, 3, 15] class = Third 5->6 7 gini = 0.618 samples = 64 value = [31, 11, 22] class = First 5->7 9 age <= 44.5 gini = 0.662 samples = 185 value = [68, 52, 65] class = First 8->9 12 age <= 45.75 gini = 0.19 samples = 48 value = [43, 1, 4] class = First 8->12 10 gini = 0.647 samples = 98 value = [27, 27, 44] class = Third 9->10 11 gini = 0.637 samples = 87 value = [41, 25, 21] class = First 9->11 13 gini = 0.355 samples = 23 value = [18, 1, 4] class = First 12->13 14 gini = 0.0 samples = 25 value = [25, 0, 0] class = First 12->14


In [18]:
# Map centre as (latitude, longitude).
# NOTE(review): presumably the location of Cherbourg, the embarkation town
# the tree splits on; confirm.
map_center = (49.634357, -1.622649)
folium.Map(location=map_center, zoom_start=6)


Out[18]: