In [3]:
from sklearn import tree
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

from IPython.display import display
from IPython.display import Image
import pydotplus

In [4]:
from treeThinking import writeTreeYaml

The Titanic Dataset

Let's quickly walk through the Titanic dataset as an example of how to use treeThinking.

The Titanic dataset is a classic example for decision trees and random forests, so we'll work with it for familiarity. The dataset requires a little cleaning; we follow an approach similar to this guide.


In [5]:
titanic_df = pd.read_csv('../titanic.csv', index_col=None, na_values=['NA'])

In [6]:
titanic_df.head()


Out[6]:
   | pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest
0  | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO
1  | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON
2  | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON
3  | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON
4  | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON

In [7]:
titanic_df = titanic_df.drop(['body','cabin','boat'], axis=1)
titanic_df["home.dest"] = titanic_df["home.dest"].fillna("NA")

In [8]:
titanic_df = titanic_df.dropna()

In [9]:
from sklearn import preprocessing

def preprocess_titanic_df(df):
    # Encode the categorical columns (sex, embarked) as integers and
    # drop the free-text columns that a tree can't use directly.
    processed_df = df.copy()
    le = preprocessing.LabelEncoder()
    processed_df.sex = le.fit_transform(processed_df.sex)
    processed_df.embarked = le.fit_transform(processed_df.embarked)
    processed_df = processed_df.drop(['name','ticket','home.dest'], axis=1)
    return processed_df



In [10]:
processed_df = preprocess_titanic_df(titanic_df)
processed_df.head()


Out[10]:
pclass survived sex age sibsp parch fare embarked
0 1 1 0 29.0000 0 0 211.3375 2
1 1 1 1 0.9167 1 2 151.5500 2
2 1 0 0 2.0000 1 2 151.5500 2
3 1 0 1 30.0000 1 2 151.5500 2
4 1 0 0 25.0000 1 2 151.5500 2

In [11]:
X = processed_df.drop(['survived'], axis=1).values
y = processed_df['survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
clf_dt = tree.DecisionTreeClassifier(max_depth=4)
clf_dt.fit(X_train, y_train)
clf_dt.score(X_test, y_test)


Out[12]:
0.79904306220095689
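
For reference, `score` on a classifier reports mean accuracy, so the same number can be recovered directly from the predictions (a quick sanity check, not needed for what follows):

manual_accuracy = np.mean(clf_dt.predict(X_test) == y_test)
print(manual_accuracy)  # matches clf_dt.score(X_test, y_test)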

In [13]:
feats = list(processed_df.drop(['survived'], axis=1).columns)

The Trees

Let's produce the tree both in graphical form and as YAML.


In [14]:
def draw_tree(dat_tree, data_cols, label_cols=['False','True']):
    # Render a fitted sklearn tree inline via graphviz/pydotplus.
    dot_data = tree.export_graphviz(dat_tree, out_file=None,
                                    feature_names=list(data_cols),
                                    class_names=list(label_cols),
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    display(Image(graph.create_png()))

In [15]:
draw_tree(clf_dt, feats)



In [16]:
yaml_tree = writeTreeYaml(clf_dt, feats, 'survived')
print yaml_tree


---
class_name: 'survived'
features:
  - name: 'pclass'
  - name: 'sex'
  - name: 'age'
  - name: 'sibsp'
  - name: 'parch'
  - name: 'fare'
  - name: 'embarked'
nodes:
  feature_idx: 1
  thr: 0.5
  results: 
    true: 
      feature_idx: 0
      thr: 2.5
      results: 
        true: 
          feature_idx: 5
          thr: 32.0895996094
          results: 
            true: 
              feature_idx: 5
              thr: 31.3395996094
              results: 
                true: 
                  prob: 0.87012987013
                false: 
                  prob: 0.0
            false: 
              feature_idx: 2
              thr: 7.0
              results: 
                true: 
                  prob: 0.5
                false: 
                  prob: 0.989898989899
        false: 
          feature_idx: 6
          thr: 0.5
          results: 
            true: 
              feature_idx: 2
              thr: 41.5
              results: 
                true: 
                  prob: 0.933333333333
                false: 
                  prob: 0.5
            false: 
              feature_idx: 5
              thr: 17.25
              results: 
                true: 
                  prob: 0.5
                false: 
                  prob: 0.153846153846
    false: 
      feature_idx: 2
      thr: 9.5
      results: 
        true: 
          feature_idx: 3
          thr: 2.5
          results: 
            true: 
              feature_idx: 2
              thr: 0.375
              results: 
                true: 
                  prob: 0.0
                false: 
                  prob: 0.916666666667
            false: 
              feature_idx: 4
              thr: 1.5
              results: 
                true: 
                  prob: 0.0
                false: 
                  prob: 0.2
        false: 
          feature_idx: 0
          thr: 1.5
          results: 
            true: 
              feature_idx: 2
              thr: 53.5
              results: 
                true: 
                  prob: 0.410526315789
                false: 
                  prob: 0.0909090909091
            false: 
              feature_idx: 2
              thr: 32.25
              results: 
                true: 
                  prob: 0.152727272727
                false: 
                  prob: 0.0660377358491

Parsing and using the YAML

Now we can read this YAML back in, parse it, and use it to make predictions.


In [17]:
from treeThinking import yamlSwitch
import yaml

In [18]:
# safe_load is enough here: the tree YAML contains only plain mappings and scalars
tree_dict = yaml.safe_load(yaml_tree)
predictions = []
for v in X_test:
    predictions.append(yamlSwitch(v, tree_dict['nodes'], tree_dict['class_name'])[1])
# float(sum([(abs(x[0]-x[1])) for x in zip(predictions,y_test)]))/len(predictions)
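
`yamlSwitch` is doing the tree walk for us. As a rough sketch of what that walk presumably looks like (assuming, as in sklearn's convention, that the `true` branch means the feature value is <= `thr`, and that a leaf's `prob` is the probability of the positive class; `walk_tree` below is a hypothetical stand-in, not part of treeThinking):

def walk_tree(row, node):
    # Descend until we reach a leaf, then threshold the leaf probability
    # at 0.5 to get a class label. Note that PyYAML parses the unquoted
    # true/false keys as Python booleans, hence the bool(...) lookup.
    while 'prob' not in node:
        branch = bool(row[node['feature_idx']] <= node['thr'])
        node = node['results'][branch]
    return node['prob'], int(node['prob'] >= 0.5)

sketch_predictions = [walk_tree(v, tree_dict['nodes'])[1] for v in X_test]

If those assumptions about the encoding hold, `sketch_predictions` should agree with `clf_dt.predict(X_test)`.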

The Iris Dataset

The other classic example for decision trees is the Iris dataset. Unlike the binary Titanic target, Iris requires multi-class classification over three labels ("setosa", "versicolor", "virginica").


In [19]:
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=5, random_state=0)
estimator.fit(X_train, y_train)



In [20]:
print writeTreeYaml(estimator, iris.feature_names, "label")


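The Titanic YAML above stores a single `prob` per leaf, which is enough for a binary target; a multi-class tree needs one probability per class at each leaf. As a minimal sketch (an assumption about what a multi-class extension would need, not necessarily treeThinking's actual output format), those per-class probabilities can be read straight off the fitted sklearn tree:

t = estimator.tree_
for node_id in range(t.node_count):
    if t.children_left[node_id] == -1:  # -1 marks a leaf in sklearn's tree structure
        counts = t.value[node_id][0]
        probs = counts / counts.sum()
        print(dict(zip(iris.target_names, np.round(probs, 3))))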

In [21]:
draw_tree(estimator, iris.feature_names, iris.target_names)