Building a decision tree from scratch - a beginner tutorial

  • 在这个例子中, 使用了numpy, pandas
  • 可以更之前的例子比较, 简化了一些.

In [2]:
import numpy as np
import pandas as pd

In [83]:
my_data=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]

In [84]:
df = pd.DataFrame(my_data)

In [85]:
df = df.rename(columns = {0:'Referer', 1:'Country', 2:'Read', 3:'Visit', 4:'Subscription'})

In [86]:
# define nodes class
class decisionnode:
    def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
        self.col=col
        self.value=value
        self.results=results
        self.tb=tb
        self.fb=fb

In [87]:
def divideset(df, column, value):
    """according value to divide sets into two parts
    """
    if isinstance(value,int) or isinstance(value,float):
        set1 = df[df[column] >= value]
        set2 = df[df[column] < value]
    else:
        set1 = df[df[column] == value]
        set2 = df[df[column] != value]
    return (set1,set2)

In [88]:
def uniquecounts(df, column='Subscription'):
    '''count the different class 
    '''
    results = {}
    results_list = df[column].unique()
    for item in results_list:
        results[item] = len(df[df[column] == item])
    return results

In [89]:
uniquecounts(df, 'Subscription')


Out[89]:
{'Basic': 6, 'None': 7, 'Premium': 3}

In [90]:
uniquecounts(divideset(df, 'Visit', 20)[0])


Out[90]:
{'Basic': 3, 'None': 1, 'Premium': 3}

In [91]:
uniquecounts(divideset(df, 'Visit', 20)[1], 'Subscription')


Out[91]:
{'Basic': 3, 'None': 6}

In [92]:
def entropy(df, column='Subscription'):
    '''calculate the entropy
    '''
    log2=lambda x:np.log(x)/np.log(2) 
    results=uniquecounts(df, column)
    # Now calculate the entropy
    ent=0.0
    for r in results.keys():
        p=float(results[r])/len(df)
        ent=ent-p*log2(p)
    return ent

In [93]:
df1, df2 = divideset(df, 'Visit', 20)
entropy(df1, 'Subscription'), entropy(df2, 'Subscription')


Out[93]:
(1.4488156357251847, 0.91829583405448956)

In [94]:
entropy(df, 'Subscription')


Out[94]:
1.5052408149441479

In [95]:
class decisionnode:
    def __init__(self, col='Subscription', value=None, results=None, tb=None, fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb

In [98]:
def buildtree(df, scoref=entropy, column='Subscription'):
    
    if len(df)==0: # check the items number of the set
        return decisionnode() 
    current_score = scoref(df) # calculate the current entropy
    
    # initialization
    best_gain = 0.0
    best_criteria = None
    best_sets = None
  
    columns = df.columns[0:-1] # pick up the features
    
    # calculate the best information gain
    for col in columns:
        global column_values        
        column_values = df.loc[:, col].values            
        for value in column_values: 
            (set1,set2) = divideset(df, col, value)
      
      # Information gain
            p = float(len(set1))/len(df) #p is the size of a child set relative to its parent
            gain = current_score - p*scoref(set1) - (1 - p)*scoref(set2) #cf. formula information gain
            if gain > best_gain and len(set1) > 0 and len(set2) > 0: #set must not be empty
                best_gain = gain
                best_criteria = (col,value)
                best_sets = (set1,set2)
        
    if best_gain > 0:
        trueBranch = buildtree(best_sets[0])
        falseBranch = buildtree(best_sets[1])
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                        tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(df))

In [99]:
tree = buildtree(df)

In [100]:
print(tree.col)
print(tree.value)
print(tree.value)
print("")
print(tree.tb.col)
print(tree.tb.value)
print(tree.tb.results)
print("")
print(tree.tb.tb.col)
print(tree.tb.tb.value)
print(tree.tb.tb.results)
print("")
print(tree.tb.fb.col)
print(tree.tb.fb.value)
print(tree.tb.fb.results)


Referer
google
None

Visit
21
None

Subscription
None
{'Premium': 3}

Read
no
None

In [108]:
def printtree(tree,indent=''):
    if tree.results!=None:
        print str(tree.results)
    else:

        print str(tree.col)+':'+str(tree.value)+'? '


        print indent+'T->',
        printtree(tree.tb,indent+'  ')
        print indent+'F->',
        printtree(tree.fb,indent+'  ')

In [109]:
printtree(tree)


Referer:google? 
T-> Visit:21? 
  T-> {'Premium': 3}
  F-> Read:no? 
    T-> {'None': 1}
    F-> {'Basic': 1}
F-> Referer:slashdot? 
  T-> {'None': 3}
  F-> Read:yes? 
    T-> {'Basic': 4}
    F-> Visit:21? 
      T-> {'Basic': 1}
      F-> {'None': 3}

In [ ]: