In [1]:
import numpy as np
import pandas as pd
In [3]:
train_data = pd.read_csv('./train.csv', index_col="PassengerId") # 读入训练集
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-3-7aab94a59337> in <module>()
----> 1 train_data = pd.read_csv('./train.csv', index_col="PassengerId") # 读入训练集
NameError: name 'pd' is not defined
In [2]:
train_data
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-2-d76a646c9c28> in <module>()
----> 1 train_data
NameError: name 'train_data' is not defined
In [4]:
# 清洗数据
# 利用['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']这些维度的特征
def data_clean(data):
columns_of_X = ['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[columns_of_X]
X['Sex'].replace({'male':1, 'female':0}, inplace=True)
X['Age'].fillna(0, inplace=True) # 将Age中的NaN转换为0
X['Embarked'].replace({
'S': 0,
'C': 1,
'Q': 2,
np.nan: 3
}, inplace=True) # 将Embarked中的不同类别映射到数字域c
X.fillna(0,inplace=True)
X.applymap(lambda x: int(x)) #转换为int类型
return X
train_X = data_clean(train_data)
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:3117: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:2862: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:2705: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
**kwargs)
In [ ]:
In [17]:
train_X
Out[17]:
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
PassengerId
1
3
1
22
1
0
7.2500
0
2
1
0
38
1
0
71.2833
1
3
3
0
26
0
0
7.9250
0
4
1
0
35
1
0
53.1000
0
5
3
1
35
0
0
8.0500
0
6
3
1
0
0
0
8.4583
2
7
1
1
54
0
0
51.8625
0
8
3
1
2
3
1
21.0750
0
9
3
0
27
0
2
11.1333
0
10
2
0
14
1
0
30.0708
1
11
3
0
4
1
1
16.7000
0
12
1
0
58
0
0
26.5500
0
13
3
1
20
0
0
8.0500
0
14
3
1
39
1
5
31.2750
0
15
3
0
14
0
0
7.8542
0
16
2
0
55
0
0
16.0000
0
17
3
1
2
4
1
29.1250
2
18
2
1
0
0
0
13.0000
0
19
3
0
31
1
0
18.0000
0
20
3
0
0
0
0
7.2250
1
21
2
1
35
0
0
26.0000
0
22
2
1
34
0
0
13.0000
0
23
3
0
15
0
0
8.0292
2
24
1
1
28
0
0
35.5000
0
25
3
0
8
3
1
21.0750
0
26
3
0
38
1
5
31.3875
0
27
3
1
0
0
0
7.2250
1
28
1
1
19
3
2
263.0000
0
29
3
0
0
0
0
7.8792
2
30
3
1
0
0
0
7.8958
0
...
...
...
...
...
...
...
...
862
2
1
21
1
0
11.5000
0
863
1
0
48
0
0
25.9292
0
864
3
0
0
8
2
69.5500
0
865
2
1
24
0
0
13.0000
0
866
2
0
42
0
0
13.0000
0
867
2
0
27
1
0
13.8583
1
868
1
1
31
0
0
50.4958
0
869
3
1
0
0
0
9.5000
0
870
3
1
4
1
1
11.1333
0
871
3
1
26
0
0
7.8958
0
872
1
0
47
1
1
52.5542
0
873
1
1
33
0
0
5.0000
0
874
3
1
47
0
0
9.0000
0
875
2
0
28
1
0
24.0000
1
876
3
0
15
0
0
7.2250
1
877
3
1
20
0
0
9.8458
0
878
3
1
19
0
0
7.8958
0
879
3
1
0
0
0
7.8958
0
880
1
0
56
0
1
83.1583
1
881
2
0
25
0
1
26.0000
0
882
3
1
33
0
0
7.8958
0
883
3
0
22
0
0
10.5167
0
884
2
1
28
0
0
10.5000
0
885
3
1
25
0
0
7.0500
0
886
3
0
39
0
5
29.1250
2
887
2
1
27
0
0
13.0000
0
888
1
0
19
0
0
30.0000
0
889
3
0
0
1
2
23.4500
0
890
1
1
26
0
0
30.0000
1
891
3
1
32
0
0
7.7500
2
891 rows × 7 columns
In [18]:
train_Y = train_data['Survived']
In [19]:
train_Y
Out[19]:
PassengerId
1 0
2 1
3 1
4 1
5 0
6 0
7 0
8 0
9 1
10 1
11 1
12 1
13 0
14 0
15 0
16 1
17 0
18 1
19 0
20 1
21 0
22 1
23 1
24 1
25 0
26 1
27 0
28 0
29 1
30 0
..
862 0
863 1
864 0
865 0
866 1
867 1
868 0
869 0
870 1
871 0
872 1
873 0
874 0
875 1
876 1
877 0
878 0
879 0
880 1
881 1
882 0
883 0
884 0
885 0
886 0
887 0
888 1
889 0
890 1
891 0
Name: Survived, dtype: int64
In [32]:
from sklearn import tree
from sklearn import cross_validation
In [39]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
train_X, train_Y,test_size=0.4,random_state=0)
In [40]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
Out[40]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
In [41]:
clf.score(X_test, y_test)
Out[41]:
0.77030812324929976
In [43]:
scores = cross_validation.cross_val_score(clf, train_X, train_Y,cv=5)
In [44]:
scores
Out[44]:
array([ 0.74301676, 0.74860335, 0.81460674, 0.75280899, 0.76271186])
In [50]:
# K折交叉验证,选择最优的决策树模型
train_X
Out[50]:
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
PassengerId
1
3
1
22
1
0
7.2500
0
2
1
0
38
1
0
71.2833
1
3
3
0
26
0
0
7.9250
0
4
1
0
35
1
0
53.1000
0
5
3
1
35
0
0
8.0500
0
6
3
1
0
0
0
8.4583
2
7
1
1
54
0
0
51.8625
0
8
3
1
2
3
1
21.0750
0
9
3
0
27
0
2
11.1333
0
10
2
0
14
1
0
30.0708
1
11
3
0
4
1
1
16.7000
0
12
1
0
58
0
0
26.5500
0
13
3
1
20
0
0
8.0500
0
14
3
1
39
1
5
31.2750
0
15
3
0
14
0
0
7.8542
0
16
2
0
55
0
0
16.0000
0
17
3
1
2
4
1
29.1250
2
18
2
1
0
0
0
13.0000
0
19
3
0
31
1
0
18.0000
0
20
3
0
0
0
0
7.2250
1
21
2
1
35
0
0
26.0000
0
22
2
1
34
0
0
13.0000
0
23
3
0
15
0
0
8.0292
2
24
1
1
28
0
0
35.5000
0
25
3
0
8
3
1
21.0750
0
26
3
0
38
1
5
31.3875
0
27
3
1
0
0
0
7.2250
1
28
1
1
19
3
2
263.0000
0
29
3
0
0
0
0
7.8792
2
30
3
1
0
0
0
7.8958
0
...
...
...
...
...
...
...
...
862
2
1
21
1
0
11.5000
0
863
1
0
48
0
0
25.9292
0
864
3
0
0
8
2
69.5500
0
865
2
1
24
0
0
13.0000
0
866
2
0
42
0
0
13.0000
0
867
2
0
27
1
0
13.8583
1
868
1
1
31
0
0
50.4958
0
869
3
1
0
0
0
9.5000
0
870
3
1
4
1
1
11.1333
0
871
3
1
26
0
0
7.8958
0
872
1
0
47
1
1
52.5542
0
873
1
1
33
0
0
5.0000
0
874
3
1
47
0
0
9.0000
0
875
2
0
28
1
0
24.0000
1
876
3
0
15
0
0
7.2250
1
877
3
1
20
0
0
9.8458
0
878
3
1
19
0
0
7.8958
0
879
3
1
0
0
0
7.8958
0
880
1
0
56
0
1
83.1583
1
881
2
0
25
0
1
26.0000
0
882
3
1
33
0
0
7.8958
0
883
3
0
22
0
0
10.5167
0
884
2
1
28
0
0
10.5000
0
885
3
1
25
0
0
7.0500
0
886
3
0
39
0
5
29.1250
2
887
2
1
27
0
0
13.0000
0
888
1
0
19
0
0
30.0000
0
889
3
0
0
1
2
23.4500
0
890
1
1
26
0
0
30.0000
1
891
3
1
32
0
0
7.7500
2
891 rows × 7 columns
In [51]:
k_fold = cross_validation.KFold(len(train_X), 5, shuffle=True)
In [56]:
train_X.iloc[1]
Out[56]:
Pclass 1.0000
Sex 0.0000
Age 38.0000
SibSp 1.0000
Parch 0.0000
Fare 71.2833
Embarked 1.0000
Name: 2, dtype: float64
In [58]:
k_fold = cross_validation.KFold(len(train_X), 5, shuffle=True)
best_score = 0
best_clf = tree.DecisionTreeClassifier()
for k, (train_index, test_index) in enumerate(k_fold):
clf.fit(train_X.iloc[train_index], train_Y.iloc[train_index])
score = clf.score(train_X.iloc[test_index], train_Y.iloc[test_index])
print score
if score > best_score:
best_score = score
best_clf = clf
0.793296089385
0.76404494382
0.797752808989
0.820224719101
0.808988764045
In [23]:
test_data = pd.read_csv('./test.csv', index_col='PassengerId')
test_X = data_clean(test_data)
In [35]:
test_X
Out[35]:
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
PassengerId
892
3
1
34.5
0
0
7.8292
2
893
3
0
47.0
1
0
7.0000
0
894
2
1
62.0
0
0
9.6875
2
895
3
1
27.0
0
0
8.6625
0
896
3
0
22.0
1
1
12.2875
0
897
3
1
14.0
0
0
9.2250
0
898
3
0
30.0
0
0
7.6292
2
899
2
1
26.0
1
1
29.0000
0
900
3
0
18.0
0
0
7.2292
1
901
3
1
21.0
2
0
24.1500
0
902
3
1
0.0
0
0
7.8958
0
903
1
1
46.0
0
0
26.0000
0
904
1
0
23.0
1
0
82.2667
0
905
2
1
63.0
1
0
26.0000
0
906
1
0
47.0
1
0
61.1750
0
907
2
0
24.0
1
0
27.7208
1
908
2
1
35.0
0
0
12.3500
2
909
3
1
21.0
0
0
7.2250
1
910
3
0
27.0
1
0
7.9250
0
911
3
0
45.0
0
0
7.2250
1
912
1
1
55.0
1
0
59.4000
1
913
3
1
9.0
0
1
3.1708
0
914
1
0
0.0
0
0
31.6833
0
915
1
1
21.0
0
1
61.3792
1
916
1
0
48.0
1
3
262.3750
1
917
3
1
50.0
1
0
14.5000
0
918
1
0
22.0
0
1
61.9792
1
919
3
1
22.5
0
0
7.2250
1
920
1
1
41.0
0
0
30.5000
0
921
3
1
0.0
2
0
21.6792
1
...
...
...
...
...
...
...
...
1280
3
1
21.0
0
0
7.7500
2
1281
3
1
6.0
3
1
21.0750
0
1282
1
1
23.0
0
0
93.5000
0
1283
1
0
51.0
0
1
39.4000
0
1284
3
1
13.0
0
2
20.2500
0
1285
2
1
47.0
0
0
10.5000
0
1286
3
1
29.0
3
1
22.0250
0
1287
1
0
18.0
1
0
60.0000
0
1288
3
1
24.0
0
0
7.2500
2
1289
1
0
48.0
1
1
79.2000
1
1290
3
1
22.0
0
0
7.7750
0
1291
3
1
31.0
0
0
7.7333
2
1292
1
0
30.0
0
0
164.8667
0
1293
2
1
38.0
1
0
21.0000
0
1294
1
0
22.0
0
1
59.4000
1
1295
1
1
17.0
0
0
47.1000
0
1296
1
1
43.0
1
0
27.7208
1
1297
2
1
20.0
0
0
13.8625
1
1298
2
1
23.0
1
0
10.5000
0
1299
1
1
50.0
1
1
211.5000
1
1300
3
0
0.0
0
0
7.7208
2
1301
3
0
3.0
1
1
13.7750
0
1302
3
0
0.0
0
0
7.7500
2
1303
1
0
37.0
1
0
90.0000
2
1304
3
0
28.0
0
0
7.7750
0
1305
3
1
0.0
0
0
8.0500
0
1306
1
0
39.0
0
0
108.9000
1
1307
3
1
38.5
0
0
7.2500
0
1308
3
1
0.0
0
0
8.0500
0
1309
3
1
0.0
1
1
22.3583
1
418 rows × 7 columns
In [59]:
predict_y = best_clf.predict(test_X)
In [60]:
predict_y
Out[60]:
array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
1, 0, 0, 1])
In [61]:
result_dt = pd.DataFrame(data=predict_y, index=range(892, 1310), columns=['Survived'])
result_dt
Out[61]:
Survived
892
0
893
0
894
1
895
1
896
0
897
0
898
0
899
0
900
0
901
0
902
0
903
1
904
1
905
0
906
1
907
1
908
0
909
1
910
0
911
0
912
0
913
1
914
1
915
1
916
1
917
0
918
1
919
1
920
1
921
0
...
...
1280
0
1281
0
1282
0
1283
1
1284
0
1285
0
1286
0
1287
1
1288
0
1289
1
1290
0
1291
0
1292
1
1293
0
1294
1
1295
1
1296
0
1297
1
1298
0
1299
0
1300
1
1301
1
1302
1
1303
1
1304
0
1305
0
1306
1
1307
0
1308
0
1309
1
418 rows × 1 columns
In [62]:
result_dt.to_csv('my_result.csv', index_label=['PassengerId'])
In [14]:
validation_Y = clf.predict(train_X)
In [15]:
validation_Y
Out[15]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])
In [16]:
train_Y
Out[16]:
PassengerId
1 0
2 1
3 1
4 1
5 0
6 0
7 0
8 0
9 1
10 1
11 1
12 1
13 0
14 0
15 0
16 1
17 0
18 1
19 0
20 1
21 0
22 1
23 1
24 1
25 0
26 1
27 0
28 0
29 1
30 0
..
862 0
863 1
864 0
865 0
866 1
867 1
868 0
869 0
870 1
871 0
872 1
873 0
874 0
875 1
876 1
877 0
878 0
879 0
880 1
881 1
882 0
883 0
884 0
885 0
886 0
887 0
888 1
889 0
890 1
891 0
Name: Survived, dtype: int64
In [43]:
1- 1.0*(train_Y - validation_Y).abs().sum() / len(train_Y)
Out[43]:
0.9820426487093153
In [44]:
len(train_Y)
Out[44]:
891
In [45]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
In [46]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),algorithm='SAMME')
In [47]:
bdt.fit(train_X, train_Y)
Out[47]:
AdaBoostClassifier(algorithm='SAMME',
base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'),
learning_rate=1.0, n_estimators=50, random_state=None)
In [48]:
validation_Y = bdt.predict(train_X)
validation_Y
Out[48]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])
In [49]:
(train_Y - validation_Y).abs().sum()
Out[49]:
16
In [50]:
result = bdt.predict(test_X)
In [66]:
result_pd = pd.DataFrame(data=result, index=range(892, 1310), columns=['Survived'])
In [68]:
result_pd.index.name = 'PassengerId'
In [69]:
result_pd
Out[69]:
Survived
PassengerId
892
0
893
0
894
1
895
1
896
1
897
0
898
0
899
0
900
1
901
0
902
0
903
0
904
1
905
0
906
1
907
1
908
0
909
1
910
1
911
0
912
1
913
0
914
1
915
1
916
1
917
0
918
1
919
1
920
1
921
0
...
...
1280
0
1281
0
1282
0
1283
1
1284
0
1285
0
1286
0
1287
1
1288
0
1289
1
1290
0
1291
0
1292
1
1293
0
1294
1
1295
0
1296
0
1297
0
1298
0
1299
1
1300
1
1301
1
1302
1
1303
1
1304
0
1305
0
1306
1
1307
0
1308
0
1309
1
418 rows × 1 columns
In [53]:
result_pd.to_csv('adaboost.csv', index_label=['PassengerId'])
In [60]:
result_dt = pd.read_csv('my_result.csv', index_col='PassengerId')
result_dt.drop('0', axis=1)
Out[60]:
Survived
PassengerId
892
0
893
0
894
1
895
1
896
0
897
0
898
0
899
1
900
1
901
0
902
0
903
0
904
1
905
0
906
1
907
1
908
0
909
1
910
0
911
0
912
1
913
1
914
1
915
0
916
1
917
0
918
1
919
1
920
1
921
0
...
...
1280
1
1281
0
1282
1
1283
1
1284
0
1285
0
1286
0
1287
1
1288
0
1289
1
1290
0
1291
0
1292
1
1293
0
1294
1
1295
0
1296
0
1297
0
1298
0
1299
0
1300
1
1301
1
1302
1
1303
1
1304
1
1305
0
1306
1
1307
0
1308
0
1309
1
418 rows × 1 columns
In [55]:
result
Out[55]:
array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
1, 0, 0, 1])
In [64]:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-64-6c57d35a3def> in <module>()
----> 1 result_pd.index = 'PassengerId'
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in __setattr__(self, name, value)
2369 try:
2370 object.__getattribute__(self, name)
-> 2371 return object.__setattr__(self, name, value)
2372 except AttributeError:
2373 pass
pandas/src/properties.pyx in pandas.lib.AxisProperty.__set__ (pandas/lib.c:45002)()
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_axis(self, axis, labels)
423
424 def _set_axis(self, axis, labels):
--> 425 self._data.set_axis(axis, labels)
426 self._clear_item_cache()
427
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in set_axis(self, axis, new_labels)
2564
2565 def set_axis(self, axis, new_labels):
-> 2566 new_labels = _ensure_index(new_labels)
2567 old_len = len(self.axes[axis])
2568 new_len = len(new_labels)
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in _ensure_index(index_like, copy)
6092 index_like = copy(index_like)
6093
-> 6094 return Index(index_like)
6095
6096
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
190 **kwargs)
191 elif data is None or np.isscalar(data):
--> 192 cls._scalar_data_error(data)
193 else:
194 if tupleize_cols and isinstance(data, list) and data and isinstance(data[0], tuple):
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in _scalar_data_error(cls, data)
334 raise TypeError(
335 '{0}(...) must be called with a collection of some kind, {1} was '
--> 336 'passed'.format(cls.__name__, repr(data))
337 )
338
TypeError: Index(...) must be called with a collection of some kind, 'PassengerId' was passed
In [70]:
result_dt - result_pd
Out[70]:
0
Survived
PassengerId
892
NaN
0
893
NaN
0
894
NaN
0
895
NaN
0
896
NaN
-1
897
NaN
0
898
NaN
0
899
NaN
1
900
NaN
0
901
NaN
0
902
NaN
0
903
NaN
0
904
NaN
0
905
NaN
0
906
NaN
0
907
NaN
0
908
NaN
0
909
NaN
0
910
NaN
-1
911
NaN
0
912
NaN
0
913
NaN
1
914
NaN
0
915
NaN
-1
916
NaN
0
917
NaN
0
918
NaN
0
919
NaN
0
920
NaN
0
921
NaN
0
...
...
...
1280
NaN
1
1281
NaN
0
1282
NaN
1
1283
NaN
0
1284
NaN
0
1285
NaN
0
1286
NaN
0
1287
NaN
0
1288
NaN
0
1289
NaN
0
1290
NaN
0
1291
NaN
0
1292
NaN
0
1293
NaN
0
1294
NaN
0
1295
NaN
0
1296
NaN
0
1297
NaN
0
1298
NaN
0
1299
NaN
-1
1300
NaN
0
1301
NaN
0
1302
NaN
0
1303
NaN
0
1304
NaN
1
1305
NaN
0
1306
NaN
0
1307
NaN
0
1308
NaN
0
1309
NaN
0
418 rows × 2 columns