Decision Tree

基本知识

  • 算法
    • ID3
    • C4.5
    • CART

In [1]:
import numpy as np
import pandas as pd

预处理

  • 选取特征
    • Sex
    • Survived
    • Pclass
    • age

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head()


Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [4]:
df = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Parch', 'Survived']]

In [5]:
df = df.replace(["male", "female"], [0,1])

In [6]:
df = df[df.Age.notnull()]

In [7]:
df.head()


Out[7]:
Pclass Sex Age Fare Parch Survived
0 3 0 22.0 7.2500 0 0
1 1 1 38.0 71.2833 0 1
2 3 1 26.0 7.9250 0 1
3 1 1 35.0 53.1000 0 1
4 3 0 35.0 8.0500 0 0

In [8]:
df.describe()


Out[8]:
Pclass Sex Age Fare Parch Survived
count 714.000000 714.000000 714.000000 714.000000 714.000000 714.000000
mean 2.236695 0.365546 29.699118 34.694514 0.431373 0.406162
std 0.838250 0.481921 14.526497 52.918930 0.853289 0.491460
min 1.000000 0.000000 0.420000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 20.125000 8.050000 0.000000 0.000000
50% 2.000000 0.000000 28.000000 15.741700 0.000000 0.000000
75% 3.000000 1.000000 38.000000 33.375000 1.000000 1.000000
max 3.000000 1.000000 80.000000 512.329200 6.000000 1.000000

ID3


In [9]:
def divideset(df, column, value):
    if isinstance(value, float) or len(df.loc[:, column].unique()) >= 4:
        set1 = df[df[column] >= value]
        set2 = df[df[column] < value]
        return [set1,set2]
    else:
        sets = []
        for i in range(len(df[column].unique())):
            sets.append(df[df[column] == df[column].unique()[i]])
        return sets

In [10]:
def count_item(df, column='Survived'):

    results = {}
    results_list = df[column].unique()
    for item in results_list:
        results[item] = len(df[df[column] == item])
    return results

In [11]:
def entropy(df, column='Survived'):
    '''calculate the entropy
    '''
    log2=lambda x:np.log(x)/np.log(2) 
    results=count_item(df, column)
    # Now calculate the entropy
    ent=0.0
    for r in results.keys():
        p=float(results[r])/len(df)
        ent=ent-p*log2(p)
    return ent

In [12]:
class decisionnode:
    def __init__(self, col='Survived', value=None, results=None, branch=None):
        self.col = col
        self.value = value
        self.results = results
        self.branch = branch

In [16]:
def buildtree(df, features, scoref=entropy, column='Survived'):
    
    print len(df)
    if len(df)==0: # check the items number of the set
        return decisionnode()
    
    current_score = scoref(df) # calculate the current entropy
    
    if len(features) == 0:
        return "Done!"
    
    # initialization
    best_gain = 0.0
    best_criteria = None
    best_sets = None
    sets = []
    
    #sfsf
    for feature in features:
        if isinstance(df[feature].values[0], float) or len(df.loc[:, feature].unique()) >= 4:
            # for such different values, we use bisection method 
            column_values = df[df[feature] > 0.].loc[:, feature].unique()         
            for value in column_values:
                sets = divideset(df, feature, value)
                p = float(len(sets))/len(df) #p is the size of a child set relative to its parent
                gain = current_score - p*scoref(sets[0]) - (1 - p)*scoref(sets[1]) #cf. formula information gain
                # print 'Gain:'+str(gain), 'Value:'+str(value)
                if gain > best_gain and len(sets)> 0: #set must not be empty
                    best_gain = gain
                    best_criteria = (feature, value)
                    best_sets = sets
            print 'Feature:' + feature, 'Its best_gain:'+str(best_gain), 'best_criteria:'+str(best_criteria)
        else:
            sets = divideset(df, feature, 0)
            E_wight = 0
            for i in range(len(sets)):
                E_wight += float(len(sets[i]))/len(df)*scoref(sets[i])
            gain = current_score - E_wight
            # print 'Gain:'+str(gain)
            if gain > best_gain and len(sets) > 0: #set must not be empty
                best_gain = gain
                best_criteria = (feature, df.loc[:, feature].unique())
                best_sets = sets
            print 'Feature:' + feature, 'Its best_gain:'+str(best_gain), 'best_criteria:'+str(best_criteria)
    print 'best_gain:'+str(best_gain), 'best_criteria:'+str(best_criteria)
    if best_gain > 0:
        i = 0
        for feature in features:         
            if feature == best_criteria[0]:
                index = i
                break
            i += 1
                
        features = np.delete(features, index)
        print features
        branches = []
        for j in range(len(best_sets)):
            print str(j+1)+'loop'
            branches.append(buildtree(best_sets[j], features))
            print str(j)+features
        return decisionnode(col=best_criteria[0], value=best_criteria[1], branch=branches)
    else:
        return decisionnode(results=count_item(df))

In [17]:
features = df.columns.values[0: -1]

In [18]:
tree = buildtree(df, features)


714
Feature:Pclass Its best_gain:0.0956802853653 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Sex Its best_gain:0.216016060752 best_criteria:('Sex', array([0, 1]))
Feature:Age Its best_gain:0.971721243718 best_criteria:('Age', 1.0)
Feature:Fare Its best_gain:0.971721243718 best_criteria:('Age', 1.0)
Feature:Parch Its best_gain:0.971721243718 best_criteria:('Age', 1.0)
best_gain:0.971721243718 best_criteria:('Age', 1.0)
['Pclass' 'Sex' 'Fare' 'Parch']
1loop
707
Feature:Pclass Its best_gain:0.0979064882575 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Sex Its best_gain:0.222711526805 best_criteria:('Sex', array([0, 1]))
Feature:Fare Its best_gain:0.632020377971 best_criteria:('Fare', 6.9749999999999996)
Feature:Parch Its best_gain:0.632020377971 best_criteria:('Fare', 6.9749999999999996)
best_gain:0.632020377971 best_criteria:('Fare', 6.9749999999999996)
['Pclass' 'Sex' 'Parch']
1loop
691
Feature:Pclass Its best_gain:0.100294027175 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Sex Its best_gain:0.222539448199 best_criteria:('Sex', array([0, 1]))
Feature:Parch Its best_gain:0.222539448199 best_criteria:('Sex', array([0, 1]))
best_gain:0.222539448199 best_criteria:('Sex', array([0, 1]))
['Pclass' 'Parch']
1loop
433
Feature:Pclass Its best_gain:0.0486476263771 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Parch Its best_gain:0.0486476263771 best_criteria:('Pclass', array([3, 1, 2]))
best_gain:0.0486476263771 best_criteria:('Pclass', array([3, 1, 2]))
['Parch']
1loop
241
Feature:Parch Its best_gain:0.0212363604493 best_criteria:('Parch', 1)
best_gain:0.0212363604493 best_criteria:('Parch', 1)
[]
1loop
42
[]
2loop
199
[]
['0Parch']
2loop
96
Feature:Parch Its best_gain:0.0180628970017 best_criteria:('Parch', 4)
best_gain:0.0180628970017 best_criteria:('Parch', 4)
[]
1loop
1
[]
2loop
95
[]
['1Parch']
3loop
96
Feature:Parch Its best_gain:0.067017973327 best_criteria:('Parch', array([0, 1, 2]))
best_gain:0.067017973327 best_criteria:('Parch', array([0, 1, 2]))
[]
1loop
80
[]
2loop
12
[]
3loop
4
[]
['2Parch']
['0Pclass' '0Parch']
2loop
258
Feature:Pclass Its best_gain:0.231485333579 best_criteria:('Pclass', array([1, 3, 2]))
Feature:Parch Its best_gain:0.231485333579 best_criteria:('Pclass', array([1, 3, 2]))
best_gain:0.231485333579 best_criteria:('Pclass', array([1, 3, 2]))
['Parch']
1loop
85
Feature:Parch Its best_gain:0.040411973749 best_criteria:('Parch', array([0, 2, 1]))
best_gain:0.040411973749 best_criteria:('Parch', array([0, 2, 1]))
[]
1loop
56
[]
2loop
13
[]
3loop
16
[]
['0Parch']
2loop
99
Feature:Parch Its best_gain:0.0189473465243 best_criteria:('Parch', 6)
best_gain:0.0189473465243 best_criteria:('Parch', 6)
[]
1loop
1
[]
2loop
98
[]
['1Parch']
3loop
74
Feature:Parch Its best_gain:0.00334442736933 best_criteria:('Parch', 3)
best_gain:0.00334442736933 best_criteria:('Parch', 3)
[]
1loop
2
[]
2loop
72
[]
['2Parch']
['1Pclass' '1Parch']
['0Pclass' '0Sex' '0Parch']
2loop
16
Feature:Pclass Its best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
Feature:Sex Its best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
Feature:Parch Its best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
['Sex' 'Parch']
1loop
12
Feature:Sex Its best_gain:0.0109446122922 best_criteria:('Sex', array([0, 1]))
Feature:Parch Its best_gain:0.0109446122922 best_criteria:('Sex', array([0, 1]))
best_gain:0.0109446122922 best_criteria:('Sex', array([0, 1]))
['Parch']
1loop
11
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['0Parch']
2loop
1
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['1Parch']
['0Sex' '0Parch']
2loop
4
Feature:Sex Its best_gain:0.0 best_criteria:None
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['1Sex' '1Parch']
['1Pclass' '1Sex' '1Parch']
['0Pclass' '0Sex' '0Fare' '0Parch']
2loop
7
Feature:Pclass Its best_gain:0.0 best_criteria:None
Feature:Sex Its best_gain:0.0 best_criteria:None
Feature:Fare Its best_gain:0.0 best_criteria:None
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['1Pclass' '1Sex' '1Fare' '1Parch']

Ps: 该例子未完成...


In [ ]: