In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

from sklearn.tree import export_graphviz
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw Titanic passenger data from the local Excel file.
# NOTE(review): reading a legacy .xls file requires the `xlrd` engine to be
# installed (modern pandas only uses xlrd for .xls) — confirm the environment.
df = pd.read_excel('titanic_data.xls')
df.tail()


Out[2]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
1304 3 0 Zabour, Miss. Hileni female 14.5 1 0 2665 14.4542 NaN C NaN 328.0 NaN
1305 3 0 Zabour, Miss. Thamine female NaN 1 0 2665 14.4542 NaN C NaN NaN NaN
1306 3 0 Zakarian, Mr. Mapriededer male 26.5 0 0 2656 7.2250 NaN C NaN 304.0 NaN
1307 3 0 Zakarian, Mr. Ortin male 27.0 0 0 2670 7.2250 NaN C NaN NaN NaN
1308 3 0 Zimmerman, Mr. Leo male 29.0 0 0 315082 7.8750 NaN S NaN NaN NaN

In [64]:
# Select the feature and target columns, drop rows with missing values,
# and split into feature frame (dfX) and target frame (dfy).
feature_names = ["pclass", "age", "sex"]
target_name = ["survived"]

# Drop rows with any NaN, then renumber rows 0..n-1.
# Resetting the index AFTER dropna keeps dfX and dfy on one shared,
# gap-free RangeIndex (the original reset only dfX, leaving dfy with
# the pre-drop index — misaligned frames downstream).
df = (
    df[feature_names + target_name]
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

# split feature & target — both views share the same index now
dfX = df[feature_names]
dfy = df[target_name]

df.tail()


Out[64]:
pclass age sex survived
1041 3 45.5 male 0
1042 3 14.5 female 0
1043 3 26.5 male 0
1044 3 27.0 male 0
1045 3 29.0 male 0

In [65]:
# LabelEncoder: map the string category 'sex' to integer codes
# (alphabetical: female -> 0, male -> 1, per the Out[65] table).
# Use bracket assignment — attribute-style `dfX.sex = ...` can silently
# create an instance attribute instead of updating the column.
dfX["sex"] = LabelEncoder().fit_transform(dfX["sex"])
dfX.tail()


Out[65]:
pclass age sex
1041 3 45.5 1
1042 3 14.5 0
1043 3 26.5 1
1044 3 27.0 1
1045 3 29.0 1

In [66]:
# One-hot encode 'pclass' (1/2/3) into three indicator columns.
# `.as_matrix()` was removed in pandas 1.0 — pass the single-column
# DataFrame dfX[["pclass"]] straight to the encoder (already 2-D, so the
# manual [:, np.newaxis] reshape is unnecessary).
onehot = OneHotEncoder().fit_transform(dfX[["pclass"]]).toarray()
dfX2 = pd.DataFrame(onehot,
                    columns=['first_class', 'second_class', 'third_class'],
                    index=dfX.index)
# Replace the original integer column with the indicator columns.
dfX = pd.concat([dfX.drop(columns=["pclass"]), dfX2], axis=1)
dfX.tail()


Out[66]:
age sex first_class second_class third_class
1041 45.5 1 0.0 0.0 1.0
1042 14.5 0 0.0 0.0 1.0
1043 26.5 1 0.0 0.0 1.0
1044 27.0 1 0.0 0.0 1.0
1045 29.0 1 0.0 0.0 1.0

In [113]:
# Split into training (80%) and test (20%) sets; fixed random_state for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size=0.2, random_state=1)

# Train a depth-limited decision tree.
# min_samples_leaf=5: stop splitting a node once a leaf would hold fewer
# than 5 samples, even before max_depth is reached.
# np.ravel: dfy is a one-column DataFrame; sklearn expects a 1-D target
# and otherwise raises a DataConversionWarning.
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5).fit(X_train, np.ravel(y_train))

In [117]:
# Evaluate on the held-out test set.
# Predict once and reuse — the original called model.predict(X_test)
# three times for the same result.
y_pred = model.predict(X_test)
print("accuracy score : {}".format(accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


accuracy score : 0.8380952380952381
[[110  13]
 [ 21  66]]
             precision    recall  f1-score   support

          0       0.84      0.89      0.87       123
          1       0.84      0.76      0.80        87

avg / total       0.84      0.84      0.84       210


In [125]:
# Spot-check the model on two hand-crafted passengers.
# Column order must match training: age, sex, first_class, second_class,
# third_class. Build DataFrames with the training column names so the
# mapping is explicit (and sklearn does not warn about missing feature
# names against a DataFrame-fitted model).
# age 26, sex: female (0), class: first
predict_test1 = pd.DataFrame([[26, 0, 1, 0, 0]], columns=X_train.columns)

# age 53, sex: male (1), class: third
predict_test2 = pd.DataFrame([[53, 1, 0, 0, 1]], columns=X_train.columns)

print(model.predict(predict_test1))  # expected: survived (1)
print(model.predict(predict_test2))  # expected: did not survive (0)
print(model.predict_proba(predict_test1))
print(model.predict_proba(predict_test2))


[1]
[0]
[[ 0.03669725  0.96330275]]
[[ 0.91764706  0.08235294]]