In [2]:
import numpy as np
import pandas as pd
In [83]:
my_data=[['slashdot','USA','yes',18,'None'],
['google','France','yes',23,'Premium'],
['digg','USA','yes',24,'Basic'],
['kiwitobes','France','yes',23,'Basic'],
['google','UK','no',21,'Premium'],
['(direct)','New Zealand','no',12,'None'],
['(direct)','UK','no',21,'Basic'],
['google','USA','no',24,'Premium'],
['slashdot','France','yes',19,'None'],
['digg','USA','no',18,'None'],
['google','UK','no',18,'None'],
['kiwitobes','UK','no',19,'None'],
['digg','New Zealand','yes',12,'Basic'],
['slashdot','UK','no',21,'None'],
['google','UK','yes',18,'Basic'],
['kiwitobes','France','yes',19,'Basic']]
In [84]:
df = pd.DataFrame(my_data)
In [85]:
df = df.rename(columns = {0:'Referer', 1:'Country', 2:'Read', 3:'Visit', 4:'Subscription'})
In [86]:
# define the decision tree node class
class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col          # column tested at this node
        self.value = value      # value the column is compared against
        self.results = results  # class counts; only set for leaf nodes
        self.tb = tb            # "true" branch (rows that pass the test)
        self.fb = fb            # "false" branch (rows that fail the test)
In [87]:
def divideset(df, column, value):
    """Split df into two subsets according to `value` in `column`."""
    # treat numpy scalars (from pandas) the same as plain int/float
    if isinstance(value, (int, float, np.integer, np.floating)):
        # numeric column: threshold split on >=
        set1 = df[df[column] >= value]
        set2 = df[df[column] < value]
    else:
        # categorical column: equality split
        set1 = df[df[column] == value]
        set2 = df[df[column] != value]
    return (set1, set2)
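For example (a quick check, not part of the original run), splitting on the categorical column 'Referer' with the value 'google' separates the google rows from everything else:

set1, set2 = divideset(df, 'Referer', 'google')
len(set1), len(set2)   # 5 google rows vs. 11 others for this dataset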
In [88]:
def uniquecounts(df, column='Subscription'):
    '''Count how many rows fall into each class of `column`.'''
    results = {}
    for item in df[column].unique():
        results[item] = len(df[df[column] == item])
    return results
In [89]:
uniquecounts(df, 'Subscription')
Out[89]:
In [90]:
uniquecounts(divideset(df, 'Visit', 20)[0])
Out[90]:
In [91]:
uniquecounts(divideset(df, 'Visit', 20)[1], 'Subscription')
Out[91]:
In [92]:
def entropy(df, column='Subscription'):
    '''Calculate the entropy of the class distribution in `column`.'''
    log2 = lambda x: np.log(x) / np.log(2)
    results = uniquecounts(df, column)
    # ent = -sum(p * log2(p)) over all classes
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(df)
        ent = ent - p * log2(p)
    return ent
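This is the usual Shannon entropy, ent = -sum over classes of p * log2(p), where p is the fraction of rows in that class. As a quick sanity check (a sketch; these lines are not part of the original notebook), it can be reproduced by hand from the class counts:

counts = uniquecounts(df)            # {'None': 7, 'Premium': 3, 'Basic': 6}
probs = [c / len(df) for c in counts.values()]
-sum(p * np.log2(p) for p in probs)  # ≈ 1.51, should match entropy(df)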
In [93]:
df1, df2 = divideset(df, 'Visit', 20)
entropy(df1, 'Subscription'), entropy(df2, 'Subscription')
Out[93]:
In [94]:
entropy(df, 'Subscription')
Out[94]:
In [95]:
# redefine the node class (same fields; only the default for `col` changes)
class decisionnode:
    def __init__(self, col='Subscription', value=None, results=None, tb=None, fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
In [98]:
def buildtree(df, scoref=entropy, column='Subscription'):
    if len(df) == 0:                       # empty set -> empty leaf
        return decisionnode()
    current_score = scoref(df, column)     # entropy of the current set
    # initialization
    best_gain = 0.0
    best_criteria = None
    best_sets = None
    columns = [c for c in df.columns if c != column]   # candidate feature columns
    # find the split with the best information gain
    for col in columns:
        for value in df[col].unique():     # each distinct value is a candidate split point
            (set1, set2) = divideset(df, col, value)
            # information gain: p is the size of set1 relative to its parent
            p = float(len(set1)) / len(df)
            gain = current_score - p*scoref(set1, column) - (1 - p)*scoref(set2, column)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:  # both subsets must be non-empty
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    if best_gain > 0:
        # recurse on the two subsets
        trueBranch = buildtree(best_sets[0], scoref, column)
        falseBranch = buildtree(best_sets[1], scoref, column)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)
    else:
        # no further gain: make a leaf holding the class counts
        return decisionnode(results=uniquecounts(df, column))
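The gain computed inside the loop is the standard information gain for a binary split: gain = H(parent) - p*H(set1) - (1 - p)*H(set2), with p = len(set1)/len(parent). As a quick check (a sketch; not part of the original notebook), the gain of the Visit >= 20 split tried earlier can be computed directly from the pieces defined above:

set1, set2 = divideset(df, 'Visit', 20)
p = len(set1) / len(df)
entropy(df) - p*entropy(set1) - (1 - p)*entropy(set2)   # ≈ 0.35 for this dataset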
In [99]:
tree = buildtree(df)
In [100]:
print(tree.col)
print(tree.value)
print(tree.results)
print("")
print(tree.tb.col)
print(tree.tb.value)
print(tree.tb.results)
print("")
print(tree.tb.tb.col)
print(tree.tb.tb.value)
print(tree.tb.tb.results)
print("")
print(tree.tb.fb.col)
print(tree.tb.fb.value)
print(tree.tb.fb.results)
In [108]:
def printtree(tree, indent=''):
    if tree.results is not None:
        # leaf node: print the class counts
        print(str(tree.results))
    else:
        # internal node: print the test, then both branches
        print(str(tree.col) + ':' + str(tree.value) + '? ')
        print(indent + 'T->', end=' ')
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->', end=' ')
        printtree(tree.fb, indent + '  ')
In [109]:
printtree(tree)
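To actually use the built tree for prediction, a small helper can walk the tb/fb branches for a new observation. This is a minimal sketch, not part of the original notebook; the name classify and the dict-based row are assumptions, mirroring the tests in divideset:

def classify(row, tree):
    # leaf: return the class counts stored at this node
    if tree.results is not None:
        return tree.results
    v = row[tree.col]
    # numeric test uses >=, categorical test uses ==, as in divideset
    if isinstance(v, (int, float, np.integer, np.floating)):
        branch = tree.tb if v >= tree.value else tree.fb
    else:
        branch = tree.tb if v == tree.value else tree.fb
    return classify(row, branch)

# e.g. classify a hypothetical new visitor
classify({'Referer': 'google', 'Country': 'UK', 'Read': 'yes', 'Visit': 21}, tree)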
In [ ]: