# Decision Tree

## 基本知识

• 算法
• ID3
• C4.5
• CART
``````

In [1]:

import numpy as np
import pandas as pd

``````

## 预处理

• 选取特征
• Sex
• Survived
• Pclass
• age
``````

In [2]:

``````
``````

In [3]:

``````
``````

Out[3]:

PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked

0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S

1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th...
female
38.0
1
0
PC 17599
71.2833
C85
C

2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S

3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S

4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S

``````
``````

In [4]:

df = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Parch', 'Survived']]

``````
``````

In [5]:

df = df.replace(["male", "female"], [0,1])

``````
``````

In [6]:

df = df[df.Age.notnull()]

``````
``````

In [7]:

``````
``````

Out[7]:

Pclass
Sex
Age
Fare
Parch
Survived

0
3
0
22.0
7.2500
0
0

1
1
1
38.0
71.2833
0
1

2
3
1
26.0
7.9250
0
1

3
1
1
35.0
53.1000
0
1

4
3
0
35.0
8.0500
0
0

``````
``````

In [8]:

df.describe()

``````
``````

Out[8]:

Pclass
Sex
Age
Fare
Parch
Survived

count
714.000000
714.000000
714.000000
714.000000
714.000000
714.000000

mean
2.236695
0.365546
29.699118
34.694514
0.431373
0.406162

std
0.838250
0.481921
14.526497
52.918930
0.853289
0.491460

min
1.000000
0.000000
0.420000
0.000000
0.000000
0.000000

25%
1.000000
0.000000
20.125000
8.050000
0.000000
0.000000

50%
2.000000
0.000000
28.000000
15.741700
0.000000
0.000000

75%
3.000000
1.000000
38.000000
33.375000
1.000000
1.000000

max
3.000000
1.000000
80.000000
512.329200
6.000000
1.000000

``````

## ID3

``````

In [9]:

def divideset(df, column, value):
if isinstance(value, float) or len(df.loc[:, column].unique()) >= 4:
set1 = df[df[column] >= value]
set2 = df[df[column] < value]
return [set1,set2]
else:
sets = []
for i in range(len(df[column].unique())):
sets.append(df[df[column] == df[column].unique()[i]])
return sets

``````
``````

In [10]:

def count_item(df, column='Survived'):

results = {}
results_list = df[column].unique()
for item in results_list:
results[item] = len(df[df[column] == item])
return results

``````
``````

In [11]:

def entropy(df, column='Survived'):
'''calculate the entropy
'''
log2=lambda x:np.log(x)/np.log(2)
results=count_item(df, column)
# Now calculate the entropy
ent=0.0
for r in results.keys():
p=float(results[r])/len(df)
ent=ent-p*log2(p)
return ent

``````
``````

In [12]:

class decisionnode:
def __init__(self, col='Survived', value=None, results=None, branch=None):
self.col = col
self.value = value
self.results = results
self.branch = branch

``````
``````

In [16]:

def buildtree(df, features, scoref=entropy, column='Survived'):

print len(df)
if len(df)==0: # check the items number of the set
return decisionnode()

current_score = scoref(df) # calculate the current entropy

if len(features) == 0:
return "Done!"

# initialization
best_gain = 0.0
best_criteria = None
best_sets = None
sets = []

#sfsf
for feature in features:
if isinstance(df[feature].values[0], float) or len(df.loc[:, feature].unique()) >= 4:
# for such different values, we use bisection method
column_values = df[df[feature] > 0.].loc[:, feature].unique()
for value in column_values:
sets = divideset(df, feature, value)
p = float(len(sets))/len(df) #p is the size of a child set relative to its parent
gain = current_score - p*scoref(sets[0]) - (1 - p)*scoref(sets[1]) #cf. formula information gain
# print 'Gain:'+str(gain), 'Value:'+str(value)
if gain > best_gain and len(sets)> 0: #set must not be empty
best_gain = gain
best_criteria = (feature, value)
best_sets = sets
print 'Feature:' + feature, 'Its best_gain:'+str(best_gain), 'best_criteria:'+str(best_criteria)
else:
sets = divideset(df, feature, 0)
E_wight = 0
for i in range(len(sets)):
E_wight += float(len(sets[i]))/len(df)*scoref(sets[i])
gain = current_score - E_wight
# print 'Gain:'+str(gain)
if gain > best_gain and len(sets) > 0: #set must not be empty
best_gain = gain
best_criteria = (feature, df.loc[:, feature].unique())
best_sets = sets
print 'Feature:' + feature, 'Its best_gain:'+str(best_gain), 'best_criteria:'+str(best_criteria)
print 'best_gain:'+str(best_gain), 'best_criteria:'+str(best_criteria)
if best_gain > 0:
i = 0
for feature in features:
if feature == best_criteria[0]:
index = i
break
i += 1

features = np.delete(features, index)
print features
branches = []
for j in range(len(best_sets)):
print str(j+1)+'loop'
branches.append(buildtree(best_sets[j], features))
print str(j)+features
return decisionnode(col=best_criteria[0], value=best_criteria[1], branch=branches)
else:
return decisionnode(results=count_item(df))

``````
``````

In [17]:

features = df.columns.values[0: -1]

``````
``````

In [18]:

tree = buildtree(df, features)

``````
``````

714
Feature:Pclass Its best_gain:0.0956802853653 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Sex Its best_gain:0.216016060752 best_criteria:('Sex', array([0, 1]))
Feature:Age Its best_gain:0.971721243718 best_criteria:('Age', 1.0)
Feature:Fare Its best_gain:0.971721243718 best_criteria:('Age', 1.0)
Feature:Parch Its best_gain:0.971721243718 best_criteria:('Age', 1.0)
best_gain:0.971721243718 best_criteria:('Age', 1.0)
['Pclass' 'Sex' 'Fare' 'Parch']
1loop
707
Feature:Pclass Its best_gain:0.0979064882575 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Sex Its best_gain:0.222711526805 best_criteria:('Sex', array([0, 1]))
Feature:Fare Its best_gain:0.632020377971 best_criteria:('Fare', 6.9749999999999996)
Feature:Parch Its best_gain:0.632020377971 best_criteria:('Fare', 6.9749999999999996)
best_gain:0.632020377971 best_criteria:('Fare', 6.9749999999999996)
['Pclass' 'Sex' 'Parch']
1loop
691
Feature:Pclass Its best_gain:0.100294027175 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Sex Its best_gain:0.222539448199 best_criteria:('Sex', array([0, 1]))
Feature:Parch Its best_gain:0.222539448199 best_criteria:('Sex', array([0, 1]))
best_gain:0.222539448199 best_criteria:('Sex', array([0, 1]))
['Pclass' 'Parch']
1loop
433
Feature:Pclass Its best_gain:0.0486476263771 best_criteria:('Pclass', array([3, 1, 2]))
Feature:Parch Its best_gain:0.0486476263771 best_criteria:('Pclass', array([3, 1, 2]))
best_gain:0.0486476263771 best_criteria:('Pclass', array([3, 1, 2]))
['Parch']
1loop
241
Feature:Parch Its best_gain:0.0212363604493 best_criteria:('Parch', 1)
best_gain:0.0212363604493 best_criteria:('Parch', 1)
[]
1loop
42
[]
2loop
199
[]
['0Parch']
2loop
96
Feature:Parch Its best_gain:0.0180628970017 best_criteria:('Parch', 4)
best_gain:0.0180628970017 best_criteria:('Parch', 4)
[]
1loop
1
[]
2loop
95
[]
['1Parch']
3loop
96
Feature:Parch Its best_gain:0.067017973327 best_criteria:('Parch', array([0, 1, 2]))
best_gain:0.067017973327 best_criteria:('Parch', array([0, 1, 2]))
[]
1loop
80
[]
2loop
12
[]
3loop
4
[]
['2Parch']
['0Pclass' '0Parch']
2loop
258
Feature:Pclass Its best_gain:0.231485333579 best_criteria:('Pclass', array([1, 3, 2]))
Feature:Parch Its best_gain:0.231485333579 best_criteria:('Pclass', array([1, 3, 2]))
best_gain:0.231485333579 best_criteria:('Pclass', array([1, 3, 2]))
['Parch']
1loop
85
Feature:Parch Its best_gain:0.040411973749 best_criteria:('Parch', array([0, 2, 1]))
best_gain:0.040411973749 best_criteria:('Parch', array([0, 2, 1]))
[]
1loop
56
[]
2loop
13
[]
3loop
16
[]
['0Parch']
2loop
99
Feature:Parch Its best_gain:0.0189473465243 best_criteria:('Parch', 6)
best_gain:0.0189473465243 best_criteria:('Parch', 6)
[]
1loop
1
[]
2loop
98
[]
['1Parch']
3loop
74
Feature:Parch Its best_gain:0.00334442736933 best_criteria:('Parch', 3)
best_gain:0.00334442736933 best_criteria:('Parch', 3)
[]
1loop
2
[]
2loop
72
[]
['2Parch']
['1Pclass' '1Parch']
['0Pclass' '0Sex' '0Parch']
2loop
16
Feature:Pclass Its best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
Feature:Sex Its best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
Feature:Parch Its best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
best_gain:0.0269274288893 best_criteria:('Pclass', array([3, 1]))
['Sex' 'Parch']
1loop
12
Feature:Sex Its best_gain:0.0109446122922 best_criteria:('Sex', array([0, 1]))
Feature:Parch Its best_gain:0.0109446122922 best_criteria:('Sex', array([0, 1]))
best_gain:0.0109446122922 best_criteria:('Sex', array([0, 1]))
['Parch']
1loop
11
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['0Parch']
2loop
1
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['1Parch']
['0Sex' '0Parch']
2loop
4
Feature:Sex Its best_gain:0.0 best_criteria:None
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['1Sex' '1Parch']
['1Pclass' '1Sex' '1Parch']
['0Pclass' '0Sex' '0Fare' '0Parch']
2loop
7
Feature:Pclass Its best_gain:0.0 best_criteria:None
Feature:Sex Its best_gain:0.0 best_criteria:None
Feature:Fare Its best_gain:0.0 best_criteria:None
Feature:Parch Its best_gain:0.0 best_criteria:None
best_gain:0.0 best_criteria:None
['1Pclass' '1Sex' '1Fare' '1Parch']

``````

## Ps: 该例子未完成...

``````

In [ ]:

``````