In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [268]:
def Ent(df, feature):
    '''Shannon entropy, in bits, of the values found in one column.

    Parameters:
        df: dataframe
        feature: label of the column whose value distribution is measured
    return:
        the sum over the column's distinct values of -p * log2(p), where p
        is the fraction of rows of df holding that value; 0 when the column
        has no values
    '''
    column = df.loc[:, feature]
    row_count = len(df)
    log_of_two = np.log(2)

    def contribution(label):
        # Fraction of rows carrying this label, then its entropy term.
        share = float((column == label).sum()) / row_count
        return -share * (np.log(share) / log_of_two)

    return sum(contribution(label) for label in column.unique())
In [269]:
# Toy dataset: two binary feature columns and a binary 'Class' label column.
# The column order is pinned explicitly (label first) so the layout does not
# depend on dict ordering, which differs between Python/pandas versions.
df = pd.DataFrame(
    {'No_surfacing': [1,1,1,0,0], 'Flippers': [1,1,0,1,1], 'Class': [1,1,0,0,0]},
    columns=['Class', 'Flippers', 'No_surfacing']
)
In [270]:
# Display the toy dataset.
df
Out[270]:
In [271]:
# Entropy of the 'Class' column of the toy dataset.
Ent(df,'Class')
Out[271]:
In [272]:
def splitDataset(df, feature, value):
    '''Select the rows where a column equals a value and drop that column.

    parameters:
        df: dataframe (not modified)
        feature: label of the column to split on
        value: the value of that column to keep
    return:
        df1: a new dataframe holding the rows of df where df[feature] == value,
             with the feature column removed; empty if no row matches
    '''
    matching_rows = df[df[feature] == value]
    # Pass axis by keyword: the positional form drop(feature, 1) was
    # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
    df1 = matching_rows.drop(feature, axis=1)
    return df1
In [273]:
def splitDataset_by_IG(df):
    '''Find the feature column with the largest information gain.

    Every column except 'Class' is treated as a candidate feature. For each
    candidate, the rows are partitioned by the feature's distinct values and
    the gain is Ent(df, 'Class') minus the size-weighted entropy of 'Class'
    over those partitions.

    parameter:
        df: dataframe containing a 'Class' column
    returns:
        bestIG: the largest information gain found (0.0 if none is positive)
        bestFeature: the feature achieving bestIG, or None when no feature
                     has a strictly positive gain
        bestSubset: the partition of df for the LAST value examined of
                    bestFeature (not all partitions), or None when
                    bestFeature is None

    Progress is printed to stdout for each candidate value examined.
    '''
    # Select features by name rather than by position so the result does not
    # depend on where the 'Class' column sits in the frame.
    features = [column for column in df.columns if column != 'Class']
    print('features:' + str(features))
    baseEntropy = Ent(df, 'Class')
    total_rows = float(len(df))
    bestIG = 0.0
    bestFeature = None
    # Initialised up front so the return below cannot hit an unbound name
    # when no feature improves on a gain of zero.
    bestSubset = None
    for feature in features:
        NewEntropy = 0.0
        for value in df[feature].unique():
            print('the values use to split:' + str(value))
            subset = splitDataset(df, feature, value)
            weight = len(subset) / total_rows
            NewEntropy += weight * Ent(subset, 'Class')
        IG = baseEntropy - NewEntropy
        if IG > bestIG:
            bestIG = IG
            bestFeature = feature
            bestSubset = subset
    return bestIG, bestFeature, bestSubset
In [274]:
def majorityChoice(df, feature='Class'):
    '''Return the most frequent value of a column (majority vote).

    parameters:
        df: dataframe
        feature: label of the column to vote on, 'Class' by default
    return:
        the value occurring most often in df[feature]; when several values
        tie for first place, the smallest of them (in sorted label order)
    raises:
        ValueError: if the column holds no values to vote on
    '''
    classcount = df[feature].value_counts().sort_index()
    if len(classcount) == 0:
        raise ValueError('majorityChoice: column %r has no values' % (feature,))
    # argmax returns the first maximum; with the labels sorted, ties resolve
    # to the smallest label. This works for any number of classes and any
    # label type, not just labels that are literally 0 and 1.
    winner = classcount.values.argmax()
    return classcount.index.values[winner]
In [275]:
def createTree(df):
    '''Recursively build a decision tree from a dataframe.

    df: dataframe containing a 'Class' column plus zero or more feature
        columns
    myTree: either a leaf (a single class value) or a nested dict of the
            form {feature: {feature_value: subtree_or_leaf, ...}}

    A leaf is returned when all rows share one class, when 'Class' is the
    only column left, or when no remaining feature is selected as a split
    (the last two cases use a majority vote over 'Class').
    '''
    classList = df['Class']
    if len(classList.unique()) == 1:
        return classList.values[0]
    if len(df.columns) == 1:
        return majorityChoice(df, 'Class')
    # Only the chosen feature is needed here; the gain and subset are unused.
    bestFeature = splitDataset_by_IG(df)[1]
    if bestFeature is None:
        # No feature was selected (e.g. rows with identical features but
        # different classes): stop splitting instead of indexing df[None].
        return majorityChoice(df, 'Class')
    branches = {}
    for value in df[bestFeature].unique():
        branches[value] = createTree(splitDataset(df, bestFeature, value))
    myTree = {bestFeature: branches}
    return myTree
In [276]:
# Build the tree for the toy dataset.
myTree = createTree(df)
In [277]:
# Display the resulting tree.
myTree
Out[277]:
In [ ]:
In [ ]:
In [ ]: