In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.tree import export_graphviz
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
In [2]:
# Load the raw passenger table from the Excel workbook (path is relative
# to the notebook's working directory).
df = pd.read_excel(io='titanic_data.xls')
# Show the last rows as a quick check that the file was read.
df.tail()
Out[2]:
In [64]:
# Columns used for modelling: three predictors and the label column.
feature_names = ["pclass", "age", "sex"]
target_name = ["survived"]

# Keep only the modelling columns, drop every row that has a missing value in
# any of them, and renumber the surviving rows exactly once. Resetting the
# index *after* dropna means df, dfX and dfy all share the same contiguous
# index, so index-aligned operations between them cannot misalign.
df = (
    df[feature_names + target_name]
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

# Split into features and target. dfX is copied because later cells modify
# its columns; the copy keeps those edits from touching df.
dfX = df[feature_names].copy()
dfy = df[target_name]

# NOTE: an alternative to dropping rows (not used here) is to impute missing
# ages with the mean age.
df.tail()
Out[64]:
In [65]:
# Replace the text values in the "sex" column with integer codes.
# LabelEncoder numbers the classes in sorted order, so presumably
# female -> 0 and male -> 1 (matches the hand-built samples at the end of
# the notebook) -- confirm against the data.
dfX["sex"] = LabelEncoder().fit_transform(dfX["sex"])
dfX.tail()
Out[65]:
In [66]:
# OneHotEncoding : expand the single "pclass" column into three 0/1 columns.
# Series.as_matrix() was removed in pandas 1.0; selecting with a list keeps
# the data 2-D, and to_numpy() yields the (n_samples, 1) array the encoder
# expects.
pclass_values = dfX[["pclass"]].to_numpy()

# fit_transform returns a sparse matrix by default, so densify it before
# building the DataFrame.
pclass_onehot = OneHotEncoder().fit_transform(pclass_values).toarray()

# The hard-coded names assume exactly three classes, encoded in sorted order
# (presumably 1, 2, 3 -- confirm against the data). pandas raises if the
# number of encoded columns does not match the number of names.
dfX2 = pd.DataFrame(
    pclass_onehot,
    columns=['first_class', 'second_class', 'third_class'],
    index=dfX.index,
)

# Append the indicator columns and drop the original categorical column.
dfX = pd.concat([dfX, dfX2], axis=1).drop(columns="pclass")
dfX.tail()
Out[66]:
In [113]:
# One seed for every stochastic step in this cell so that a fresh
# "Restart & Run All" reproduces the same split and the same tree.
RANDOM_STATE = 1

# Split into training (80%) and test (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(
    dfX, dfy, test_size=0.2, random_state=RANDOM_STATE
)

# Train an entropy-based decision tree.
# - max_depth=5 caps how deep the tree may grow.
# - min_samples_leaf=5 only allows a split if each resulting leaf keeps at
#   least 5 training samples, so a branch can stop growing before max_depth.
# - random_state is fixed because scikit-learn randomly permutes the features
#   at every split; without it, ties between equally good splits can be
#   resolved differently from run to run.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=5,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)
In [117]:
# Evaluate on the held-out test set. Predict once and reuse the result for
# all three reports instead of re-running the model for each metric.
y_pred = model.predict(X_test)

print("accuracy score : {}".format(accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
In [125]:
# Hand-built passengers. Values are listed in the training column order:
# age, sex, first_class, second_class, third_class.
# age : 26, sex : female, Pclass : first_class
predict_test1 = np.array([26, 0, 1, 0, 0]).reshape(1, -1)
# age : 53, sex : male, Pclass : third_class
predict_test2 = np.array([53, 1, 0, 0, 1]).reshape(1, -1)

# The model was fitted on a labelled DataFrame, so label the samples with the
# same column names. This avoids scikit-learn's "X does not have valid feature
# names" warning, and pandas raises if a sample has the wrong number of
# values instead of predicting on a mis-shaped row.
sample1 = pd.DataFrame(predict_test1, columns=X_train.columns)
sample2 = pd.DataFrame(predict_test2, columns=X_train.columns)

print(model.predict(sample1))  # expected: 1 (survived)
print(model.predict(sample2))  # expected: 0 (did not survive)
# Class probabilities, ordered as in model.classes_.
print(model.predict_proba(sample1))
print(model.predict_proba(sample2))