In [3]:
from sklearn import tree
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from IPython.display import display
from IPython.display import Image
import pydotplus
In [4]:
from treeThinking import writeTreeYaml
Let's quickly walk through the Titanic dataset as an example of how to use treeThinking.
The Titanic dataset is a well-known example for decision trees and random forests, so we'll work with it for familiarity. The dataset requires a little cleaning; we follow an approach similar to this guide.
In [5]:
# Load the Titanic passenger table from the CSV one directory up.
# The literal string 'NA' is parsed as a missing value; no column is used as the index.
titanic_df = pd.read_csv(
    '../titanic.csv',
    na_values=['NA'],
    index_col=None,
)
In [6]:
# Preview the first five rows of the freshly loaded frame.
titanic_df.head(n=5)
Out[6]:
In [7]:
# Remove the 'body', 'cabin' and 'boat' columns from the frame.
titanic_df = titanic_df.drop(labels=['body', 'cabin', 'boat'], axis=1)
# Replace missing 'home.dest' entries with the placeholder string "NA" so these
# rows are not treated as incomplete by a later dropna().
titanic_df["home.dest"] = titanic_df["home.dest"].fillna(value="NA")
In [8]:
# Keep only rows with no missing value in any remaining column
# (axis=0 / how='any' are the dropna defaults, written out for clarity).
titanic_df = titanic_df.dropna(axis=0, how='any')
In [9]:
from sklearn import preprocessing, cross_validation
def preprocess_titanic_df(df):
    """Return a model-ready copy of the Titanic frame.

    The input frame is not modified. On the copy, the 'sex' and 'embarked'
    columns are replaced by integer codes produced by a LabelEncoder fitted
    on that column, and the 'name', 'ticket' and 'home.dest' columns are
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'sex', 'embarked', 'name',
        'ticket' and 'home.dest'.

    Returns
    -------
    pandas.DataFrame
        The encoded copy without the three dropped columns.
    """
    encoded = df.copy()
    # Each column gets its own fit, so the codes of one column never depend
    # on the categories seen in the other.
    for column in ('sex', 'embarked'):
        encoded[column] = preprocessing.LabelEncoder().fit_transform(encoded[column])
    return encoded.drop(['name', 'ticket', 'home.dest'], axis=1)
In [10]:
# Encode / trim the cleaned frame, then preview the first five rows of the result.
processed_df = preprocess_titanic_df(df=titanic_df)
processed_df.head(n=5)
Out[10]:
In [11]:
# Separate the feature matrix from the 'survived' target and hold out 20% of
# the rows for testing.
# train_test_split comes from sklearn.model_selection (imported in the first
# cell): the old sklearn.cross_validation module was deprecated in 0.18 and
# removed in 0.20.
# random_state is pinned so the split is the same on every run.
X = processed_df.drop(['survived'], axis=1).values
y = processed_df['survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [12]:
# Fit a decision tree limited to depth 4 on the training split and show its
# mean accuracy on the held-out split.
# random_state is fixed because the tree builder permutes features at each
# split, so equally good splits can otherwise be chosen differently per run.
clf_dt = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
clf_dt.fit(X_train, y_train)
clf_dt.score(X_test, y_test)
Out[12]:
In [13]:
# Feature names in column order: every column of processed_df except the target.
feats = processed_df.drop(['survived'], axis=1).columns.tolist()
In [14]:
def draw_tree(dat_tree, data_cols, label_cols=['False','True']):
    """Display a fitted decision tree inline as a PNG image.

    The tree is exported to Graphviz DOT text, converted to a graph with
    pydotplus, rendered to PNG and shown through IPython's display machinery.

    Parameters
    ----------
    dat_tree :
        Fitted tree estimator accepted by ``tree.export_graphviz``.
    data_cols : iterable
        Feature names; converted to a list and passed as ``feature_names``.
    label_cols : sequence, optional
        Class names passed as ``class_names``. Defaults to
        ``['False', 'True']``. The default list is never mutated here.

    Returns
    -------
    None
    """
    export_options = dict(
        out_file=None,  # return the DOT source as a string instead of writing a file
        feature_names=list(data_cols),
        class_names=label_cols,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    dot_source = tree.export_graphviz(dat_tree, **export_options)
    png_bytes = pydotplus.graph_from_dot_data(dot_source).create_png()
    display(Image(png_bytes))
In [15]:
# Render the Titanic tree with its feature names and the default class labels.
draw_tree(clf_dt, data_cols=feats)
In [16]:
# Build treeThinking's YAML description of the fitted tree from the classifier,
# the feature names and the target name 'survived'.
yaml_tree = writeTreeYaml(clf_dt, feats, 'survived')
# Call form of print: valid on Python 3 and, with a single argument,
# prints the same thing on Python 2.
print(yaml_tree)
In [17]:
from treeThinking import yamlSwitch
import yaml
In [18]:
# Parse the YAML text back into Python data.
# safe_load is used because yaml.load() without an explicit Loader is
# deprecated since PyYAML 5.1 and raises TypeError in PyYAML 6.
# NOTE(review): assumes writeTreeYaml emits only plain YAML types - confirm.
tree_dict = yaml.safe_load(yaml_tree)
# Walk every test row through the YAML tree; element [1] of yamlSwitch's
# return value is kept as the prediction for that row.
predictions = [
    yamlSwitch(v, tree_dict['nodes'], tree_dict['class_name'])[1]
    for v in X_test
]
# Disabled check: mean absolute difference between predictions and y_test.
# float(sum([(abs(x[0]-x[1])) for x in zip(predictions,y_test)]))/len(predictions)
In [1]:
# Second example: fit a small tree (at most 5 leaves) on the iris data.
# Note: this rebinds X, y and the train/test names used by the Titanic cells.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
estimator = DecisionTreeClassifier(random_state=0, max_leaf_nodes=5)
estimator.fit(X_train, y_train)
In [2]:
# Show the YAML description of the iris tree. The call form of print is valid on
# Python 3 and, with a single argument, prints the same thing on Python 2.
print(writeTreeYaml(estimator, iris.feature_names, "label"))
In [72]:
# The second argument of draw_tree is the list of feature names, not the
# training matrix: passing X_train made each "feature name" a row of data.
draw_tree(estimator, iris.feature_names, iris.target_names)