In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



A Decision Tree Example from Machine Learning in Action


In [268]:
def Ent(df, feature):
    '''Parameters:
            df: dataframe
            feature: the column name whose entropy is computed
       return: Shannon entropy of the column, in bits
    '''
    entropy = 0.0
    items = df.loc[:, feature].unique()
    for item in items:
        # fraction of rows taking this value
        p = float(len(df[df[feature] == item])) / len(df)
        entropy += -p * np.log2(p)
    return entropy

In [269]:
df = pd.DataFrame({'No_surfacing': [1,1,1,0,0], 'Flippers': [1,1,0,1,1], 'Class': [1,1,0,0,0]})

In [270]:
df


Out[270]:
   Class  Flippers  No_surfacing
0      1         1             1
1      1         1             1
2      0         0             1
3      0         1             0
4      0         1             0

In [271]:
Ent(df,'Class')


Out[271]:
0.97095059445466858
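
As a check against the definition, the Class column has three 0s and two 1s, so

$$\mathrm{Ent}(D) = -\sum_k p_k \log_2 p_k = -\tfrac{3}{5}\log_2\tfrac{3}{5} - \tfrac{2}{5}\log_2\tfrac{2}{5} \approx 0.9710,$$

which matches the value above.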

In [272]:
def splitDataset(df, feature, value):
    '''parameters:
            df: dataframe
            feature: column name to split on
            value: the value of that column to keep
       return:
            df1: the subset of df where feature == value,
                 with the feature column removed
    '''
    df1 = df[df[feature] == value].drop(feature, axis=1)
    return df1
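
A quick sanity check of splitDataset on the toy dataframe above (this cell is not part of the original run):

In [ ]:
splitDataset(df, 'No_surfacing', 1)
# expected: rows 0-2 kept, No_surfacing column dropped
#    Class  Flippers
# 0      1         1
# 1      1         1
# 2      0         0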

In [273]:
def splitDataset_by_IG(df):
    '''parameter:
           df: dataframe
       returns:
           bestIG: the largest information gain
           bestFeature: the feature achieving bestIG
           bestSubset: the last subset produced while splitting on bestFeature
    '''
    features = df.columns[1:]
    print 'features: ' + ', '.join(features)
    baseEntropy = Ent(df, 'Class')
    bestIG = 0.0
    bestFeature = None
    bestSubset = None
    
    for feature in features:
        values = df[feature].unique()
        NewEntropy = 0.0
        for value in values:
            print 'value used to split: ' + str(value)
            subset = splitDataset(df, feature, value)
            p = float(len(subset)) / len(df)        # weight of this branch
            NewEntropy += p * Ent(subset, 'Class')  # weighted child entropy
        IG = baseEntropy - NewEntropy
        if IG > bestIG:
            bestIG = IG
            bestFeature = feature
            bestSubset = subset
    return bestIG, bestFeature, bestSubset
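
For reference, the gain computed by this function follows

$$\mathrm{IG}(D, A) = \mathrm{Ent}(D) - \sum_{v \in \mathrm{values}(A)} \frac{|D_v|}{|D|}\,\mathrm{Ent}(D_v).$$

On the toy dataframe above, No_surfacing gives $\mathrm{IG} = 0.9710 - \tfrac{3}{5} \cdot 0.9183 \approx 0.4200$ while Flippers gives $0.9710 - \tfrac{4}{5} \cdot 1 \approx 0.1710$, so No_surfacing is chosen first.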

In [274]:
def majorityChoice(df, feature='Class'):
    '''
    parameters:
           df: dataframe
           feature: the class column name (default 'Class')
    return:
           the most frequent value of the column
    '''
    # value_counts() sorts by frequency, so idxmax() gives the majority
    # class, and this also works when only one class value is present
    return df[feature].value_counts().idxmax()
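
A small sanity check (not in the original run): the toy set has three 0s and two 1s in Class, so the majority vote is 0.

In [ ]:
majorityChoice(df)   # expected: 0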

In [275]:
def createTree(df):
    '''
    df: dataframe whose first column is 'Class'
    
    myTree: a nested dict of the form {feature: {value: subtree or class label}}
    '''
    classList = df['Class']
    # stop: all instances belong to the same class
    if len(classList.unique()) == 1:
        return classList.values[0]
    # stop: no features left, fall back to a majority vote
    if len(df.columns) == 1:
        return majorityChoice(df, 'Class')
    bestIG, bestFeature, bestSubset = splitDataset_by_IG(df)
    myTree = {bestFeature: {}}
    uniqueValues = df[bestFeature].unique()
    for value in uniqueValues:
        # recurse on each branch of the best feature
        myTree[bestFeature][value] = createTree(splitDataset(df, bestFeature, value))
    return myTree

In [276]:
myTree = createTree(df)


features: Flippers, No_surfacing
value used to split: 1
value used to split: 0
value used to split: 1
value used to split: 0
features: Flippers
value used to split: 1
value used to split: 0

In [277]:
myTree


Out[277]:
{'No_surfacing': {0: 0, 1: {'Flippers': {0: 0, 1: 1}}}}
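
The nested dict can be walked to classify a new sample. The classify helper below is a sketch and not part of the original notebook:

In [ ]:
def classify(tree, sample):
    '''Hypothetical helper: follow the tree until a leaf label is reached.
       sample: dict mapping feature name -> value
    '''
    while isinstance(tree, dict):
        feature = list(tree.keys())[0]         # the feature tested at this node
        tree = tree[feature][sample[feature]]  # descend into the matching branch
    return tree

classify(myTree, {'No_surfacing': 1, 'Flippers': 1})   # expected: 1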

Summary

  • Writing a decision tree needs these functions:
    • split the dataset according to a feature and one of its values
    • compute the information gain (IG), and use it to pick the best feature and split
    • decide the class of a leaf node by majority vote
    • main function: recursively build the tree
      • note: each generated subtree must be attached to the branch it belongs to
      • stopping conditions:
        • all features have been used up
        • all instances belong to the same class
