In [1]:
import folium
import graphviz
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Cap pandas' row display at 10 so DataFrame outputs in this notebook stay short.
pd.set_option('display.max_rows', 10)
# Render matplotlib figures directly in the notebook output cells.
%matplotlib inline
In [5]:
# Load seaborn's bundled Titanic dataset and relabel every row from its
# default index value i to the string "p_<i>".
titanic = (
    sns.load_dataset('titanic')
    .rename(index=lambda i: f'p_{i}')
)
titanic
Out[5]:
In [6]:
# Keep only the three predictors (sex, embark_town, age) and the target
# column ("class") that the rest of the notebook works with.
titanic_class = titanic.loc[:, ["sex", "embark_town", "age", "class"]]
In [7]:
# Number of passengers with no recorded embarkation town.
# The vectorised `.isna().sum()` replaces the builtin `sum()`, which walked the
# Series one element at a time in Python; `int(...)` keeps the displayed
# result a plain Python integer.
int(titanic_class["embark_town"].isna().sum())
Out[7]:
In [8]:
# Discard passengers for which either the embarkation town or the age is
# missing; all remaining rows are complete in both columns.
titanic_class = titanic_class.dropna(subset=["embark_town", "age"])
In [9]:
# Passenger counts per sex, with one bar per ticket class.
sns.countplot(data=titanic_class, x="sex", hue="class")
Out[9]:
In [10]:
# Passenger counts per embarkation town, with one bar per ticket class.
sns.countplot(data=titanic_class, x="embark_town", hue="class")
Out[10]:
In [11]:
# Box plot of the age distribution within each ticket class.
sns.catplot(data=titanic_class, kind="box", x="class", y="age")
Out[11]:
In [12]:
# One-hot encode the two categorical predictors. `columns=` restricts the
# encoding to "sex" and "embark_town"; "age" and "class" are passed through
# unchanged and stay in front of the generated dummy columns.
titanic_pre = pd.get_dummies(titanic_class, columns=["sex", "embark_town"])
titanic_pre
Out[12]:
In [13]:
# Feature matrix: every column except the target.
X = titanic_pre.drop(columns="class")
# Target: the ticket class, kept as a single-column DataFrame.
y = titanic_pre.loc[:, ["class"]]
In [14]:
# Fit a decision tree limited to depth 3 that predicts the ticket class;
# random_state is fixed so repeated runs build the same tree.
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=3).fit(X, y)

# Print the feature names, then their importances (same order), one per line.
print(X.columns.values, tree_clf.feature_importances_, sep="\n")
In [15]:
# Two-column table: feature name next to its importance in the fitted tree.
print(np.column_stack((X.columns.values, tree_clf.feature_importances_)))
In [16]:
# Write the fitted tree to "tree.dot" in Graphviz DOT format.
# The feature and class names are handed over as plain lists of str rather
# than as a pandas Index / NumPy array: lists are accepted by every
# scikit-learn release, while some releases validate these arguments strictly
# and reject other array types.
# NOTE(review): confirm against the scikit-learn version pinned for this notebook.
export_graphviz(
    tree_clf,
    out_file="tree.dot",
    feature_names=[str(name) for name in X.columns],
    class_names=[str(label) for label in tree_clf.classes_],
    rounded=True,
    filled=True,
)
In [17]:
# Read the exported DOT description back in. The encoding is given explicitly
# so the result does not depend on the platform's locale default
# (NOTE(review): export_graphviz is expected to write the file as UTF-8 — confirm).
with open("tree.dot", encoding="utf-8") as f:
    dot_graph = f.read()

# Render the graph as PNG into images/decision_trees; cleanup=True removes the
# intermediate source file that graphviz writes next to the image.
dot = graphviz.Source(dot_graph)
dot.format = 'png'
dot.render(filename='tree', directory='images/decision_trees', cleanup=True)

# Last expression of the cell: display the graph inline.
dot
Out[17]:
In [18]:
# Interactive map centred on latitude 49.634357, longitude -1.622649.
# NOTE(review): the coordinates look like Cherbourg, one of the embarkation
# towns — confirm.
folium.Map(zoom_start=6, location=(49.634357, -1.622649))
Out[18]: