In [1]:
import os
import pandas as pd
import math
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:
headers = ["buying", "maint", "doors", "persons","lug_boot", "safety", "class"]
data = pd.read_csv("car_data.csv", header=None, names=headers)

data = data.sample(frac=1).reset_index(drop=True) # shuffle

In [3]:
for h in headers:
    data[h] = data[h].astype('category')
    data[h] = data[h].cat.codes

data.set_index("class", inplace=True)
data.head()


Out[3]:
buying maint doors persons lug_boot safety
class
2 1 1 3 0 2 0
2 1 2 0 2 2 0
2 0 1 0 2 1 1
0 1 0 0 1 2 2
2 0 1 2 0 2 1

In [4]:
size = len(data)
train_size = int(math.floor(size * 0.7))
train_data = data[:train_size]
test_data = data[train_size:]

In [5]:
#Usando critério gini
d_tree = DecisionTreeClassifier(criterion="gini")
d_tree.fit(train_data, train_data.index)


Out[5]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [6]:
d_tree.predict(test_data.iloc[:, 0:6])
d_tree.score(test_data, test_data.index)


Out[6]:
0.96917148362235073

In [7]:
d_tree2 = DecisionTreeClassifier(criterion="entropy")
d_tree2.fit(train_data, train_data.index)


Out[7]:
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
d_tree2.predict(test_data.iloc[:, 0:6])
d_tree2.score(test_data, test_data.index)


Out[8]:
0.97687861271676302

In [ ]: