decision_tree



In [1]:
import numpy as np
import pandas as pd

In [3]:
train_data = pd.read_csv('./train.csv', index_col="PassengerId")  # 读入训练集


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-7aab94a59337> in <module>()
----> 1 train_data = pd.read_csv('./train.csv', index_col="PassengerId")  # 读入训练集

NameError: name 'pd' is not defined

In [2]:
train_data


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-d76a646c9c28> in <module>()
----> 1 train_data

NameError: name 'train_data' is not defined

In [4]:
# 清洗数据
# 利用['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']这些维度的特征

def data_clean(data):
    columns_of_X = ['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = data[columns_of_X]
    X['Sex'].replace({'male':1, 'female':0}, inplace=True)
    X['Age'].fillna(0, inplace=True) # 将Age中的NaN转换为0
    X['Embarked'].replace({
        'S': 0,
        'C': 1,
        'Q': 2,
        np.nan: 3
    }, inplace=True)  # 将Embarked中的不同类别映射到数字域c
    X.fillna(0,inplace=True)
    X.applymap(lambda x: int(x))  #转换为int类型
    return X

train_X = data_clean(train_data)


/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:3117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:2862: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:2705: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)

In [ ]:


In [17]:
train_X


Out[17]:
Pclass Sex Age SibSp Parch Fare Embarked
PassengerId
1 3 1 22 1 0 7.2500 0
2 1 0 38 1 0 71.2833 1
3 3 0 26 0 0 7.9250 0
4 1 0 35 1 0 53.1000 0
5 3 1 35 0 0 8.0500 0
6 3 1 0 0 0 8.4583 2
7 1 1 54 0 0 51.8625 0
8 3 1 2 3 1 21.0750 0
9 3 0 27 0 2 11.1333 0
10 2 0 14 1 0 30.0708 1
11 3 0 4 1 1 16.7000 0
12 1 0 58 0 0 26.5500 0
13 3 1 20 0 0 8.0500 0
14 3 1 39 1 5 31.2750 0
15 3 0 14 0 0 7.8542 0
16 2 0 55 0 0 16.0000 0
17 3 1 2 4 1 29.1250 2
18 2 1 0 0 0 13.0000 0
19 3 0 31 1 0 18.0000 0
20 3 0 0 0 0 7.2250 1
21 2 1 35 0 0 26.0000 0
22 2 1 34 0 0 13.0000 0
23 3 0 15 0 0 8.0292 2
24 1 1 28 0 0 35.5000 0
25 3 0 8 3 1 21.0750 0
26 3 0 38 1 5 31.3875 0
27 3 1 0 0 0 7.2250 1
28 1 1 19 3 2 263.0000 0
29 3 0 0 0 0 7.8792 2
30 3 1 0 0 0 7.8958 0
... ... ... ... ... ... ... ...
862 2 1 21 1 0 11.5000 0
863 1 0 48 0 0 25.9292 0
864 3 0 0 8 2 69.5500 0
865 2 1 24 0 0 13.0000 0
866 2 0 42 0 0 13.0000 0
867 2 0 27 1 0 13.8583 1
868 1 1 31 0 0 50.4958 0
869 3 1 0 0 0 9.5000 0
870 3 1 4 1 1 11.1333 0
871 3 1 26 0 0 7.8958 0
872 1 0 47 1 1 52.5542 0
873 1 1 33 0 0 5.0000 0
874 3 1 47 0 0 9.0000 0
875 2 0 28 1 0 24.0000 1
876 3 0 15 0 0 7.2250 1
877 3 1 20 0 0 9.8458 0
878 3 1 19 0 0 7.8958 0
879 3 1 0 0 0 7.8958 0
880 1 0 56 0 1 83.1583 1
881 2 0 25 0 1 26.0000 0
882 3 1 33 0 0 7.8958 0
883 3 0 22 0 0 10.5167 0
884 2 1 28 0 0 10.5000 0
885 3 1 25 0 0 7.0500 0
886 3 0 39 0 5 29.1250 2
887 2 1 27 0 0 13.0000 0
888 1 0 19 0 0 30.0000 0
889 3 0 0 1 2 23.4500 0
890 1 1 26 0 0 30.0000 1
891 3 1 32 0 0 7.7500 2

891 rows × 7 columns


In [18]:
train_Y = train_data['Survived']

In [19]:
train_Y


Out[19]:
PassengerId
1      0
2      1
3      1
4      1
5      0
6      0
7      0
8      0
9      1
10     1
11     1
12     1
13     0
14     0
15     0
16     1
17     0
18     1
19     0
20     1
21     0
22     1
23     1
24     1
25     0
26     1
27     0
28     0
29     1
30     0
      ..
862    0
863    1
864    0
865    0
866    1
867    1
868    0
869    0
870    1
871    0
872    1
873    0
874    0
875    1
876    1
877    0
878    0
879    0
880    1
881    1
882    0
883    0
884    0
885    0
886    0
887    0
888    1
889    0
890    1
891    0
Name: Survived, dtype: int64

In [32]:
from sklearn import tree
from sklearn import cross_validation

In [39]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_X, train_Y,test_size=0.4,random_state=0)

In [40]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)


Out[40]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [41]:
clf.score(X_test, y_test)


Out[41]:
0.77030812324929976

In [43]:
scores = cross_validation.cross_val_score(clf, train_X, train_Y,cv=5)

In [44]:
scores


Out[44]:
array([ 0.74301676,  0.74860335,  0.81460674,  0.75280899,  0.76271186])

In [50]:
# K折交叉验证,选择最优的决策树模型
train_X


Out[50]:
Pclass Sex Age SibSp Parch Fare Embarked
PassengerId
1 3 1 22 1 0 7.2500 0
2 1 0 38 1 0 71.2833 1
3 3 0 26 0 0 7.9250 0
4 1 0 35 1 0 53.1000 0
5 3 1 35 0 0 8.0500 0
6 3 1 0 0 0 8.4583 2
7 1 1 54 0 0 51.8625 0
8 3 1 2 3 1 21.0750 0
9 3 0 27 0 2 11.1333 0
10 2 0 14 1 0 30.0708 1
11 3 0 4 1 1 16.7000 0
12 1 0 58 0 0 26.5500 0
13 3 1 20 0 0 8.0500 0
14 3 1 39 1 5 31.2750 0
15 3 0 14 0 0 7.8542 0
16 2 0 55 0 0 16.0000 0
17 3 1 2 4 1 29.1250 2
18 2 1 0 0 0 13.0000 0
19 3 0 31 1 0 18.0000 0
20 3 0 0 0 0 7.2250 1
21 2 1 35 0 0 26.0000 0
22 2 1 34 0 0 13.0000 0
23 3 0 15 0 0 8.0292 2
24 1 1 28 0 0 35.5000 0
25 3 0 8 3 1 21.0750 0
26 3 0 38 1 5 31.3875 0
27 3 1 0 0 0 7.2250 1
28 1 1 19 3 2 263.0000 0
29 3 0 0 0 0 7.8792 2
30 3 1 0 0 0 7.8958 0
... ... ... ... ... ... ... ...
862 2 1 21 1 0 11.5000 0
863 1 0 48 0 0 25.9292 0
864 3 0 0 8 2 69.5500 0
865 2 1 24 0 0 13.0000 0
866 2 0 42 0 0 13.0000 0
867 2 0 27 1 0 13.8583 1
868 1 1 31 0 0 50.4958 0
869 3 1 0 0 0 9.5000 0
870 3 1 4 1 1 11.1333 0
871 3 1 26 0 0 7.8958 0
872 1 0 47 1 1 52.5542 0
873 1 1 33 0 0 5.0000 0
874 3 1 47 0 0 9.0000 0
875 2 0 28 1 0 24.0000 1
876 3 0 15 0 0 7.2250 1
877 3 1 20 0 0 9.8458 0
878 3 1 19 0 0 7.8958 0
879 3 1 0 0 0 7.8958 0
880 1 0 56 0 1 83.1583 1
881 2 0 25 0 1 26.0000 0
882 3 1 33 0 0 7.8958 0
883 3 0 22 0 0 10.5167 0
884 2 1 28 0 0 10.5000 0
885 3 1 25 0 0 7.0500 0
886 3 0 39 0 5 29.1250 2
887 2 1 27 0 0 13.0000 0
888 1 0 19 0 0 30.0000 0
889 3 0 0 1 2 23.4500 0
890 1 1 26 0 0 30.0000 1
891 3 1 32 0 0 7.7500 2

891 rows × 7 columns


In [51]:
k_fold = cross_validation.KFold(len(train_X), 5, shuffle=True)

In [56]:
train_X.iloc[1]


Out[56]:
Pclass       1.0000
Sex          0.0000
Age         38.0000
SibSp        1.0000
Parch        0.0000
Fare        71.2833
Embarked     1.0000
Name: 2, dtype: float64

In [58]:
k_fold = cross_validation.KFold(len(train_X), 5, shuffle=True)
best_score = 0
best_clf = tree.DecisionTreeClassifier()
for k, (train_index, test_index) in enumerate(k_fold):
    clf.fit(train_X.iloc[train_index], train_Y.iloc[train_index])
    score = clf.score(train_X.iloc[test_index], train_Y.iloc[test_index])
    print score
    if score > best_score:
        best_score = score
        best_clf = clf


0.793296089385
0.76404494382
0.797752808989
0.820224719101
0.808988764045

In [23]:
test_data = pd.read_csv('./test.csv', index_col='PassengerId')
test_X = data_clean(test_data)

In [35]:
test_X


Out[35]:
Pclass Sex Age SibSp Parch Fare Embarked
PassengerId
892 3 1 34.5 0 0 7.8292 2
893 3 0 47.0 1 0 7.0000 0
894 2 1 62.0 0 0 9.6875 2
895 3 1 27.0 0 0 8.6625 0
896 3 0 22.0 1 1 12.2875 0
897 3 1 14.0 0 0 9.2250 0
898 3 0 30.0 0 0 7.6292 2
899 2 1 26.0 1 1 29.0000 0
900 3 0 18.0 0 0 7.2292 1
901 3 1 21.0 2 0 24.1500 0
902 3 1 0.0 0 0 7.8958 0
903 1 1 46.0 0 0 26.0000 0
904 1 0 23.0 1 0 82.2667 0
905 2 1 63.0 1 0 26.0000 0
906 1 0 47.0 1 0 61.1750 0
907 2 0 24.0 1 0 27.7208 1
908 2 1 35.0 0 0 12.3500 2
909 3 1 21.0 0 0 7.2250 1
910 3 0 27.0 1 0 7.9250 0
911 3 0 45.0 0 0 7.2250 1
912 1 1 55.0 1 0 59.4000 1
913 3 1 9.0 0 1 3.1708 0
914 1 0 0.0 0 0 31.6833 0
915 1 1 21.0 0 1 61.3792 1
916 1 0 48.0 1 3 262.3750 1
917 3 1 50.0 1 0 14.5000 0
918 1 0 22.0 0 1 61.9792 1
919 3 1 22.5 0 0 7.2250 1
920 1 1 41.0 0 0 30.5000 0
921 3 1 0.0 2 0 21.6792 1
... ... ... ... ... ... ... ...
1280 3 1 21.0 0 0 7.7500 2
1281 3 1 6.0 3 1 21.0750 0
1282 1 1 23.0 0 0 93.5000 0
1283 1 0 51.0 0 1 39.4000 0
1284 3 1 13.0 0 2 20.2500 0
1285 2 1 47.0 0 0 10.5000 0
1286 3 1 29.0 3 1 22.0250 0
1287 1 0 18.0 1 0 60.0000 0
1288 3 1 24.0 0 0 7.2500 2
1289 1 0 48.0 1 1 79.2000 1
1290 3 1 22.0 0 0 7.7750 0
1291 3 1 31.0 0 0 7.7333 2
1292 1 0 30.0 0 0 164.8667 0
1293 2 1 38.0 1 0 21.0000 0
1294 1 0 22.0 0 1 59.4000 1
1295 1 1 17.0 0 0 47.1000 0
1296 1 1 43.0 1 0 27.7208 1
1297 2 1 20.0 0 0 13.8625 1
1298 2 1 23.0 1 0 10.5000 0
1299 1 1 50.0 1 1 211.5000 1
1300 3 0 0.0 0 0 7.7208 2
1301 3 0 3.0 1 1 13.7750 0
1302 3 0 0.0 0 0 7.7500 2
1303 1 0 37.0 1 0 90.0000 2
1304 3 0 28.0 0 0 7.7750 0
1305 3 1 0.0 0 0 8.0500 0
1306 1 0 39.0 0 0 108.9000 1
1307 3 1 38.5 0 0 7.2500 0
1308 3 1 0.0 0 0 8.0500 0
1309 3 1 0.0 1 1 22.3583 1

418 rows × 7 columns


In [59]:
predict_y = best_clf.predict(test_X)

In [60]:
predict_y


Out[60]:
array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1])

In [61]:
result_dt = pd.DataFrame(data=predict_y, index=range(892, 1310), columns=['Survived'])
result_dt


Out[61]:
Survived
892 0
893 0
894 1
895 1
896 0
897 0
898 0
899 0
900 0
901 0
902 0
903 1
904 1
905 0
906 1
907 1
908 0
909 1
910 0
911 0
912 0
913 1
914 1
915 1
916 1
917 0
918 1
919 1
920 1
921 0
... ...
1280 0
1281 0
1282 0
1283 1
1284 0
1285 0
1286 0
1287 1
1288 0
1289 1
1290 0
1291 0
1292 1
1293 0
1294 1
1295 1
1296 0
1297 1
1298 0
1299 0
1300 1
1301 1
1302 1
1303 1
1304 0
1305 0
1306 1
1307 0
1308 0
1309 1

418 rows × 1 columns


In [62]:
result_dt.to_csv('my_result.csv', index_label=['PassengerId'])

In [14]:
validation_Y = clf.predict(train_X)

In [15]:
validation_Y


Out[15]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [16]:
train_Y


Out[16]:
PassengerId
1      0
2      1
3      1
4      1
5      0
6      0
7      0
8      0
9      1
10     1
11     1
12     1
13     0
14     0
15     0
16     1
17     0
18     1
19     0
20     1
21     0
22     1
23     1
24     1
25     0
26     1
27     0
28     0
29     1
30     0
      ..
862    0
863    1
864    0
865    0
866    1
867    1
868    0
869    0
870    1
871    0
872    1
873    0
874    0
875    1
876    1
877    0
878    0
879    0
880    1
881    1
882    0
883    0
884    0
885    0
886    0
887    0
888    1
889    0
890    1
891    0
Name: Survived, dtype: int64

In [43]:
1- 1.0*(train_Y - validation_Y).abs().sum() / len(train_Y)


Out[43]:
0.9820426487093153

In [44]:
len(train_Y)


Out[44]:
891

使用Adaboost


In [45]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [46]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),algorithm='SAMME')

In [47]:
bdt.fit(train_X, train_Y)


Out[47]:
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [48]:
validation_Y = bdt.predict(train_X)
validation_Y


Out[48]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [49]:
(train_Y - validation_Y).abs().sum()


Out[49]:
16

In [50]:
result = bdt.predict(test_X)

In [66]:
result_pd = pd.DataFrame(data=result, index=range(892, 1310),  columns=['Survived'])

In [68]:
result_pd.index.name = 'PassengerId'

In [69]:
result_pd


Out[69]:
Survived
PassengerId
892 0
893 0
894 1
895 1
896 1
897 0
898 0
899 0
900 1
901 0
902 0
903 0
904 1
905 0
906 1
907 1
908 0
909 1
910 1
911 0
912 1
913 0
914 1
915 1
916 1
917 0
918 1
919 1
920 1
921 0
... ...
1280 0
1281 0
1282 0
1283 1
1284 0
1285 0
1286 0
1287 1
1288 0
1289 1
1290 0
1291 0
1292 1
1293 0
1294 1
1295 0
1296 0
1297 0
1298 0
1299 1
1300 1
1301 1
1302 1
1303 1
1304 0
1305 0
1306 1
1307 0
1308 0
1309 1

418 rows × 1 columns


In [53]:
result_pd.to_csv('adaboost.csv', index_label=['PassengerId'])

In [60]:
result_dt = pd.read_csv('my_result.csv', index_col='PassengerId')
result_dt.drop('0', axis=1)


Out[60]:
Survived
PassengerId
892 0
893 0
894 1
895 1
896 0
897 0
898 0
899 1
900 1
901 0
902 0
903 0
904 1
905 0
906 1
907 1
908 0
909 1
910 0
911 0
912 1
913 1
914 1
915 0
916 1
917 0
918 1
919 1
920 1
921 0
... ...
1280 1
1281 0
1282 1
1283 1
1284 0
1285 0
1286 0
1287 1
1288 0
1289 1
1290 0
1291 0
1292 1
1293 0
1294 1
1295 0
1296 0
1297 0
1298 0
1299 0
1300 1
1301 1
1302 1
1303 1
1304 1
1305 0
1306 1
1307 0
1308 0
1309 1

418 rows × 1 columns


In [55]:
result


Out[55]:
array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1])

In [64]:



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-64-6c57d35a3def> in <module>()
----> 1 result_pd.index = 'PassengerId'

/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in __setattr__(self, name, value)
   2369         try:
   2370             object.__getattribute__(self, name)
-> 2371             return object.__setattr__(self, name, value)
   2372         except AttributeError:
   2373             pass

pandas/src/properties.pyx in pandas.lib.AxisProperty.__set__ (pandas/lib.c:45002)()

/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_axis(self, axis, labels)
    423 
    424     def _set_axis(self, axis, labels):
--> 425         self._data.set_axis(axis, labels)
    426         self._clear_item_cache()
    427 

/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in set_axis(self, axis, new_labels)
   2564 
   2565     def set_axis(self, axis, new_labels):
-> 2566         new_labels = _ensure_index(new_labels)
   2567         old_len = len(self.axes[axis])
   2568         new_len = len(new_labels)

/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in _ensure_index(index_like, copy)
   6092             index_like = copy(index_like)
   6093 
-> 6094     return Index(index_like)
   6095 
   6096 

/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
    190                          **kwargs)
    191         elif data is None or np.isscalar(data):
--> 192             cls._scalar_data_error(data)
    193         else:
    194             if tupleize_cols and isinstance(data, list) and data and isinstance(data[0], tuple):

/Users/abnerzheng/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in _scalar_data_error(cls, data)
    334         raise TypeError(
    335             '{0}(...) must be called with a collection of some kind, {1} was '
--> 336             'passed'.format(cls.__name__, repr(data))
    337         )
    338 

TypeError: Index(...) must be called with a collection of some kind, 'PassengerId' was passed

In [70]:
result_dt - result_pd


Out[70]:
0 Survived
PassengerId
892 NaN 0
893 NaN 0
894 NaN 0
895 NaN 0
896 NaN -1
897 NaN 0
898 NaN 0
899 NaN 1
900 NaN 0
901 NaN 0
902 NaN 0
903 NaN 0
904 NaN 0
905 NaN 0
906 NaN 0
907 NaN 0
908 NaN 0
909 NaN 0
910 NaN -1
911 NaN 0
912 NaN 0
913 NaN 1
914 NaN 0
915 NaN -1
916 NaN 0
917 NaN 0
918 NaN 0
919 NaN 0
920 NaN 0
921 NaN 0
... ... ...
1280 NaN 1
1281 NaN 0
1282 NaN 1
1283 NaN 0
1284 NaN 0
1285 NaN 0
1286 NaN 0
1287 NaN 0
1288 NaN 0
1289 NaN 0
1290 NaN 0
1291 NaN 0
1292 NaN 0
1293 NaN 0
1294 NaN 0
1295 NaN 0
1296 NaN 0
1297 NaN 0
1298 NaN 0
1299 NaN -1
1300 NaN 0
1301 NaN 0
1302 NaN 0
1303 NaN 0
1304 NaN 1
1305 NaN 0
1306 NaN 0
1307 NaN 0
1308 NaN 0
1309 NaN 0

418 rows × 2 columns