In [1]:
import numpy
import pandas
In [2]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Sequence of row collections (for example the two halves
            produced by a split). The last element of every row is read
            as that row's class label.
        classes: The class values each group is scored against.

    Returns:
        float: The sum, over non-empty groups, of ``1 - sum(p_c ** 2)``
        weighted by the group's share of all rows. ``0.0`` when every
        group is empty.
    """
    # count all samples at split point
    n_instances = float(sum(len(group) for group in groups))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        # Pull the labels out once per group rather than once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini
In [27]:
def test_split(index, value, dataset):
left, right = [], []
for row in dataset:
if row[index] < value:
left.append(row)
else:
right.append(row)
return left, right
In [4]:
# Bare `pwd` only works through IPython's automagic (%pwd); it shows the
# directory against which the relative data path used below is resolved.
pwd
Out[4]:
In [5]:
# Read the data file from the working directory. header=None tells pandas the
# file has no header row, so the first line is treated as data.
df = pandas.read_csv('data_banknote_authentication.txt', header=None)
In [6]:
# Peek at the first rows as loaded, before the columns are renamed.
df.head()
Out[6]:
In [7]:
# Name the four leading columns X0..X3 and the final column y.
df.columns = [*map('X{}'.format, range(4)), 'y']
In [8]:
# Peek at the first rows again to check the new column names.
df.head()
Out[8]:
In [30]:
def get_split(dataset):
    """Search every (feature, row value) pair for the lowest-Gini split.

    Every column except the last is treated as a feature; the last element
    of each row is its class label. Each row's value in each feature column
    is tried as a threshold via ``test_split`` and scored with
    ``gini_index``.

    Returns:
        dict: ``'index'`` and ``'value'`` of the best split, the
        ``'groups'`` it produces, and ``'hist'`` with the recorded Gini
        scores. ``'index'``/``'value'`` stay at the 999 placeholder and
        ``'groups'`` stays ``None`` if no candidate is evaluated.
    """
    labels = list(set(row[-1] for row in dataset))
    # 999 acts as "nothing chosen yet"; the first score below 999 replaces it.
    chosen_index, chosen_value, lowest_gini, chosen_groups = 999, 999, 999, None
    history = []
    feature_count = len(dataset[0]) - 1
    for index in range(feature_count):
        for row in dataset:
            threshold = row[index]
            candidate = test_split(index, threshold, dataset)
            score = gini_index(candidate, labels)
            #print('X{} < {:.3f} Gini={:.3f}'.format(index, row[index], gini))
            if score < lowest_gini:
                chosen_index, chosen_value = index, threshold
                lowest_gini, chosen_groups = score, candidate
            # NOTE(review): the notebook export lost its indentation; the score
            # is assumed to be recorded for every candidate, not only for
            # improving ones - confirm against the original notebook.
            history.append(score)
    return {'index': chosen_index, 'value': chosen_value, 'groups': chosen_groups, 'hist': history}
In [31]:
def to_terminal(group):
    """Return the most frequent class label (last element) among ``group``'s rows."""
    labels = []
    for row in group:
        labels.append(row[-1])
    distinct = set(labels)
    # Ties are resolved by whichever tied label the set yields first.
    return max(distinct, key=labels.count)
In [32]:
def split(node, max_depth, min_size, depth):
    """Grow the tree below ``node`` in place.

    ``node`` must carry a ``'groups'`` pair (left rows, right rows); the key
    is removed and ``'left'``/``'right'`` are set to either a class label or
    a nested node dict from ``get_split``.

    Args:
        node: Split dict to expand.
        max_depth: Once ``depth`` reaches this value both children become leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node``; recursive calls pass ``depth + 1``.
    """
    left, right = node['groups']
    del node['groups']

    # One side is empty: there is no real split, so both children share one leaf.
    if not (left and right):
        leaf = to_terminal(left + right)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth limit reached: turn both sides into leaves.
    if depth >= max_depth:
        left_leaf, right_leaf = to_terminal(left), to_terminal(right)
        node['left'], node['right'] = left_leaf, right_leaf
        return

    # Left is handled completely (including its subtree) before right.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows)
            split(node[side], max_depth, min_size, depth + 1)
In [33]:
def build_tree(train, max_depth, min_size):
    """Build a decision tree from ``train`` and return its root node dict.

    The root split comes from ``get_split``; ``split`` then expands it in
    place, starting at depth 1.
    """
    tree_root = get_split(train)
    split(tree_root, max_depth, min_size, depth=1)
    return tree_root
In [44]:
def print_tree(node, depth=0):
    """Print the tree rooted at ``node``, indenting one space per level.

    Dict nodes print as ``[X<index> < <value>]`` followed by their left and
    right subtrees; anything else prints as a leaf ``[<value>]``.
    """
    indent = depth * ' '
    if not isinstance(node, dict):
        print('{spacer}[{}]'.format(node, spacer=indent))
        return
    print('{spacer}[X{:d} < {:.3f}]'.format(node['index'], node['value'], spacer=indent))
    for branch in ('left', 'right'):
        print_tree(node[branch], depth + 1)
In [38]:
# Fit the hand-rolled tree on the full data set.
tree = build_tree(df.values, max_depth=3, min_size=1)
In [45]:
# Print the fitted tree, one node per line.
print_tree(tree)
In [17]:
# NOTE(review): `dataset` is not assigned anywhere in the visible notebook, so
# this cell presumably depended on a cell that was later removed and would
# raise NameError on a fresh kernel - confirm, then define it or drop the cell.
dataset
In [19]:
# List the distinct values in the last column (the class label `y`).
numpy.unique(df.values[:, -1])
Out[19]:
In [46]:
def predict(node, row):
    """Walk the tree from ``node`` and return the leaf value reached for ``row``.

    At each dict node the row goes left when ``row[node['index']]`` is
    strictly less than ``node['value']`` and right otherwise; the first
    non-dict child encountered is the prediction.
    """
    current = node
    while True:
        side = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[side]
        if not isinstance(child, dict):
            return child
        current = child
In [60]:
import numpy
In [49]:
def decision_tree(train, test, max_depth, min_size):
    """Fit a tree on ``train`` and return a list with one prediction per ``test`` row."""
    fitted = build_tree(train, max_depth, min_size)
    return [predict(fitted, sample) for sample in test]
In [63]:
# Work on a copy: the next step shuffles this array in place, and
# DataFrame.values can share memory with `df` (depending on the column
# dtypes), which would silently reorder the rows of `df` as well.
traintest = df.values.copy()
In [65]:
# Shuffle the rows in place with a fixed seed so the train/test split that
# follows is identical on every Restart & Run All.
numpy.random.default_rng(0).shuffle(traintest)
In [66]:
# Total number of rows available for the train/test split.
len(traintest)
Out[66]:
In [67]:
# Fraction of the rows used for training; the remaining rows form the test set.
train_ratio = 0.8
In [71]:
# int() truncates, so the training set gets the floor of ratio * row count.
train_samples = int(len(traintest) * train_ratio)
# The first `train_samples` rows are the training set.
train = traintest[:train_samples]
In [72]:
# Every row after the training slice is held out for testing.
test= traintest[train_samples:]
In [73]:
# Sanity check: the two slices together account for every row.
assert len(test) + len(train) == len(traintest)
In [74]:
# Predictions of the hand-rolled tree for the held-out rows.
preds = decision_tree(train, test, max_depth=3, min_size=1)
In [76]:
from sklearn.tree import DecisionTreeClassifier
In [78]:
# Display the training array (all columns, label included).
train
Out[78]:
In [98]:
# Reference model: scikit-learn's decision tree fitted on the same training rows.
# NOTE(review): the hand-rolled tree is built with max_depth 3, which appears
# to allow three levels of splits, whereas max_depth=2 here allows two -
# confirm whether the mismatch is intentional before comparing predictions.
clf = DecisionTreeClassifier(min_samples_leaf=1, max_depth=2)
# Features are every column but the last; the last column is the label.
clf.fit(train[:, :-1], train[:, -1])
Out[98]:
In [99]:
# scikit-learn predictions for the held-out rows (label column stripped).
pred_sk = clf.predict(test[:,:-1])
In [100]:
import matplotlib.pyplot as plt

# Plot one model's predicted label against the other's. Identical predictions
# overlap exactly, so this only shows which label combinations occur, not how
# often; the counts are computed separately.
plt.scatter(preds, pred_sk)
plt.xlabel('hand-rolled tree prediction')
plt.ylabel('scikit-learn prediction')
plt.title('Predicted labels on the test rows')
plt.show()
In [ ]:
In [104]:
# Test rows where the hand-rolled tree predicts 0 and scikit-learn predicts 0.
list(zip(preds, pred_sk)).count((0,0))
Out[104]:
In [105]:
# Test rows where the hand-rolled tree predicts 0 but scikit-learn predicts 1.
list(zip(preds, pred_sk)).count((0,1))
Out[105]:
In [106]:
# Test rows where the hand-rolled tree predicts 1 but scikit-learn predicts 0.
list(zip(preds, pred_sk)).count((1,0))
Out[106]:
In [107]:
# Test rows where the hand-rolled tree predicts 1 and scikit-learn predicts 1.
list(zip(preds, pred_sk)).count((1,1))
Out[107]:
In [81]:
# Display the training rows without their last (label) column.
train[:, :-1]
Out[81]:
In [62]:
# Display the combined array from which the train and test slices are taken.
traintest
Out[62]:
In [56]:
# Display the DataFrame's contents as a NumPy array.
df.values
Out[56]:
In [53]:
# Display the training array.
train
In [50]:
# A bare `shuffle` is not defined in the visible notebook and would raise
# NameError on a fresh kernel, so use the fully qualified NumPy function.
# NOTE(review): nothing is assigned here and df.values may be a temporary
# copy, so this cell may have no lasting effect - consider removing it.
numpy.random.shuffle(df.values)
In [ ]: