In [1]:
import numpy
import pandas
In [3]:
def gini_index(groups, classes):
    """Return the size-weighted Gini index of a candidate split.

    Args:
        groups: Sized, re-iterable collection of groups. Each group is a
            sized collection of rows, and the class label of a row is read
            from its last element (``row[-1]``).
        classes: Iterable of class values to score within each group.

    Returns:
        float: The sum over all non-empty groups of
        ``(1 - sum(p_c ** 2 for c in classes)) * (len(group) / total_rows)``,
        where ``p_c`` is the fraction of rows in the group labelled ``c``.
        Returns 0.0 when every group is empty.
    """
    # count all samples at split point
    n_instances = float(sum(len(group) for group in groups))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero: an empty group contributes nothing
        if size == 0:
            continue
        # The labels depend only on the group, so extract them once here
        # instead of rebuilding the list for every class value.
        labels = [row[-1] for row in group]
        score = 0.0
        # score the group based on the squared proportion of each class
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini
In [6]:
def test_split(index, value, dataset):
left, right = [], []
for row in dataset:
if row[index] < value:
left.append(row)
else:
right.append(row)
return left, right
In [7]:
# Bare `pwd` relies on IPython automagic (treated as `%pwd`) to display the
# kernel's current working directory; it is not valid outside IPython.
pwd
Out[7]:
In [ ]:
# Read the CSV at the relative path 'data' (resolved against the current
# working directory). The resulting DataFrame is not bound to a name, so it
# is only available as this cell's displayed output.
pandas.read_csv('data')