In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



A Decision Tree Example from Machine Learning in Action


In [268]:
def Ent(df, feature):
    '''Parameters:
            df: dataframe
            feature: the column name whose entropy is computed
       return: Shannon entropy of the column, in bits
    '''
    entropy = 0.0
    items = df.loc[:, feature].unique()
    for item in items:
        # fraction of rows taking this value
        p = float(len(df[df[feature] == item])) / len(df)
        entropy += -p * np.log2(p)
    return entropy

In [269]:
df = pd.DataFrame({'No_surfacing': [1,1,1,0,0], 'Flippers': [1,1,0,1,1], 'Class': [1,1,0,0,0]})

In [270]:
df


Out[270]:
   Class  Flippers  No_surfacing
0      1         1             1
1      1         1             1
2      0         0             1
3      0         1             0
4      0         1             0

In [271]:
Ent(df,'Class')


Out[271]:
0.97095059445466858
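
As a check against the definition, the Class column has three 0s and two 1s, so

$$\mathrm{Ent}(D) = -\sum_k p_k \log_2 p_k = -\tfrac{3}{5}\log_2\tfrac{3}{5} - \tfrac{2}{5}\log_2\tfrac{2}{5} \approx 0.9710,$$

which matches the value above.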

In [272]:
def splitDataset(df, feature, value):
    '''parameters:
            df: dataframe
            feature: column name to split on
            value: the value of that column to keep
       return:
            df1: the subset of df where feature == value,
                 with the feature column removed
    '''
    df1 = df[df[feature] == value].drop(feature, axis=1)
    return df1
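
A quick sanity check of splitDataset on the toy dataframe above (this cell is not part of the original run):

In [ ]:
splitDataset(df, 'No_surfacing', 1)
# expected: rows 0-2 kept, No_surfacing column dropped
#    Class  Flippers
# 0      1         1
# 1      1         1
# 2      0         0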

In [273]:
def splitDataset_by_IG(df):
    '''parameter:
           df: dataframe
       returns:
           bestIG: the largest information gain
           bestFeature: the feature achieving bestIG
           bestSubset: the last subset produced while splitting on bestFeature
    '''
    features = df.columns[1:]
    print 'features: ' + ', '.join(features)
    baseEntropy = Ent(df, 'Class')
    bestIG = 0.0
    bestFeature = None
    bestSubset = None
    
    for feature in features:
        values = df[feature].unique()
        NewEntropy = 0.0
        for value in values:
            print 'value used to split: ' + str(value)
            subset = splitDataset(df, feature, value)
            p = float(len(subset)) / len(df)        # weight of this branch
            NewEntropy += p * Ent(subset, 'Class')  # weighted child entropy
        IG = baseEntropy - NewEntropy
        if IG > bestIG:
            bestIG = IG
            bestFeature = feature
            bestSubset = subset
    return bestIG, bestFeature, bestSubset
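
For reference, the gain computed by this function follows

$$\mathrm{IG}(D, A) = \mathrm{Ent}(D) - \sum_{v \in \mathrm{values}(A)} \frac{|D_v|}{|D|}\,\mathrm{Ent}(D_v).$$

On the toy dataframe above, No_surfacing gives $\mathrm{IG} = 0.9710 - \tfrac{3}{5} \cdot 0.9183 \approx 0.4200$ while Flippers gives $0.9710 - \tfrac{4}{5} \cdot 1 \approx 0.1710$, so No_surfacing is chosen first.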

In [274]:
def majorityChoice(df, feature='Class'):
    '''
    parameters:
           df: dataframe
           feature: the class column name (default 'Class')
    return:
           the most frequent value of the column
    '''
    # value_counts() sorts by frequency, so idxmax() gives the majority
    # class, and this also works when only one class value is present
    return df[feature].value_counts().idxmax()
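
A small sanity check (not in the original run): the toy set has three 0s and two 1s in Class, so the majority vote is 0.

In [ ]:
majorityChoice(df)   # expected: 0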

In [275]:
def createTree(df):
    '''
    df: dataframe whose first column is 'Class'
    
    myTree: a nested dict of the form {feature: {value: subtree or class label}}
    '''
    classList = df['Class']
    # stop: all instances belong to the same class
    if len(classList.unique()) == 1:
        return classList.values[0]
    # stop: no features left, fall back to a majority vote
    if len(df.columns) == 1:
        return majorityChoice(df, 'Class')
    bestIG, bestFeature, bestSubset = splitDataset_by_IG(df)
    myTree = {bestFeature: {}}
    uniqueValues = df[bestFeature].unique()
    for value in uniqueValues:
        # recurse on each branch of the best feature
        myTree[bestFeature][value] = createTree(splitDataset(df, bestFeature, value))
    return myTree

In [276]:
myTree = createTree(df)


features: Flippers, No_surfacing
value used to split: 1
value used to split: 0
value used to split: 1
value used to split: 0
features: Flippers
value used to split: 1
value used to split: 0

In [277]:
myTree


Out[277]:
{'No_surfacing': {0: 0, 1: {'Flippers': {0: 0, 1: 1}}}}
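
The nested dict can be walked to classify a new sample. The classify helper below is a sketch and not part of the original notebook:

In [ ]:
def classify(tree, sample):
    '''Hypothetical helper: follow the tree until a leaf label is reached.
       sample: dict mapping feature name -> value
    '''
    while isinstance(tree, dict):
        feature = list(tree.keys())[0]         # the feature tested at this node
        tree = tree[feature][sample[feature]]  # descend into the matching branch
    return tree

classify(myTree, {'No_surfacing': 1, 'Flippers': 1})   # expected: 1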

Summary

  • Writing a decision tree needs these functions:
    • split the dataset according to a feature and one of its values
    • compute the information gain (IG), and use it to pick the best feature and split
    • decide the class of a leaf node by majority vote
    • main function: recursively build the tree
      • note: each generated subtree must be attached to the branch it belongs to
      • stopping conditions:
        • all features have been used up
        • all instances belong to the same class
