Test_data資料預處理


In [77]:
import os
import numpy as np
import pandas as pd

filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'train.csv'

df = pd.read_csv(os.path.join(filepath+filename))

df


Out[77]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S
12 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NaN S
13 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NaN S
14 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NaN S
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NaN S
16 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NaN Q
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
18 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NaN S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
20 21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 NaN S
21 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S
22 23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 NaN Q
23 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S
24 25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 NaN S
25 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 NaN S
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C
27 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
29 30 0 3 Todoroff, Mr. Lalio male NaN 0 0 349216 7.8958 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 28134 11.5000 NaN S
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.5500 NaN S
864 865 0 2 Gill, Mr. John William male 24.0 0 0 233866 13.0000 NaN S
865 866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 236852 13.0000 NaN S
866 867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 SC/PARIS 2149 13.8583 NaN C
867 868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 PC 17590 50.4958 A24 S
868 869 0 3 van Melkebeke, Mr. Philemon male NaN 0 0 345777 9.5000 NaN S
869 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 NaN S
870 871 0 3 Balkic, Mr. Cerin male 26.0 0 0 349248 7.8958 NaN S
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S
872 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S
873 874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 345765 9.0000 NaN S
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 NaN C
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 2667 7.2250 NaN C
876 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 NaN S
877 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 NaN S
878 879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 NaN S
881 882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 NaN S
882 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 NaN S
883 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S
884 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S
885 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NaN Q
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns


In [78]:
df.isnull().sum()


Out[78]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [79]:
cols = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
df = df[cols]
df.head()


Out[79]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 22.0 1 0 7.2500 S
1 1 1 female 38.0 1 0 71.2833 C
2 1 3 female 26.0 0 0 7.9250 S
3 1 1 female 35.0 1 0 53.1000 S
4 0 3 male 35.0 0 0 8.0500 S

In [80]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Sex']= df['Sex'].map({'female':0, 'male': 1}).astype(int)
df = pd.get_dummies(data=df, columns=["Embarked"])


//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [81]:
df.shape


Out[81]:
(891, 10)

In [82]:
df.isnull().sum()


Out[82]:
Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

告一段落,準備要製作feature和label


In [86]:
all_array = df.values
all_array


Out[86]:
array([[ 0.,  3.,  1., ...,  0.,  0.,  1.],
       [ 1.,  1.,  0., ...,  1.,  0.,  0.],
       [ 1.,  3.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  3.,  0., ...,  0.,  0.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  3.,  1., ...,  0.,  1.,  0.]])

In [92]:
train_feature_raw = all_array[:,1:]
train_label = all_array[:,0]

In [93]:
train_feature_raw[0:5]


Out[93]:
array([[  3.    ,   1.    ,  22.    ,   1.    ,   0.    ,   7.25  ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,  38.    ,   1.    ,   0.    ,  71.2833,
          1.    ,   0.    ,   0.    ],
       [  3.    ,   0.    ,  26.    ,   0.    ,   0.    ,   7.925 ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,  35.    ,   1.    ,   0.    ,  53.1   ,
          0.    ,   0.    ,   1.    ],
       [  3.    ,   1.    ,  35.    ,   0.    ,   0.    ,   8.05  ,
          0.    ,   0.    ,   1.    ]])

In [94]:
train_label[0:5]


Out[94]:
array([ 0.,  1.,  1.,  1.,  0.])

In [96]:
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
train_feature = minmax_scale.fit_transform(train_feature_raw)

train_feature[0:5]


Out[96]:
array([[ 1.        ,  1.        ,  0.27117366,  0.125     ,  0.        ,
         0.01415106,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  0.4722292 ,  0.125     ,  0.        ,
         0.13913574,  1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.32143755,  0.        ,  0.        ,
         0.01546857,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  0.43453129,  0.125     ,  0.        ,
         0.1036443 ,  0.        ,  0.        ,  1.        ],
       [ 1.        ,  1.        ,  0.43453129,  0.        ,  0.        ,
         0.01571255,  0.        ,  0.        ,  1.        ]])

統整整個資料preprocessing過程(這樣才方便處理test data)


In [ ]:
def Preprocessingdata(raw_data):
    df = pd.read_csv(raw_data)
    cols = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
    df = df[cols]
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Sex']= df['Sex'].map({'female':0, 'male': 1}).astype(int)
    df = pd.get_dummies(data=df, columns=["Embarked"])
    
    all_array = df.values
    train_feature_raw = all_array[:,1:]
    train_label = all_array[:,0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    train_feature = minmax_scale.fit_transform(train_feature_raw) 
    
    return train_feature, train_label
    
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'train.csv'    
raw_data = os.path.join(filepath+filename)

跑模型囉!!


In [104]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

model = Sequential() #一層一層到底,按順序

#輸入層(隱藏層1)
model.add(Dense(units=200, 
                input_dim=9, 
                kernel_initializer='uniform', 
                activation='relu'))

#使用dropout避免overfitting
model.add(Dropout(0.5))

#隱藏層2,不用寫input_dim,因為就是前一層的units
model.add(Dense(units=100,  
                kernel_initializer='uniform', 
                activation='relu'))

#使用dropout避免overfitting
model.add(Dropout(0.5))


#輸出層
model.add(Dense(units=1, #輸出一個數字 
                kernel_initializer='uniform', 
                activation='sigmoid'))

print(model.summary()) #可以清楚看到model還有參數數量


######################### 訓練模型
#選擇loss度量,optimizer學習路徑速度、
model.compile(loss='binary_crossentropy',   #二元用binary
              optimizer='adam', metrics=['accuracy'])

#開始train,並且記錄情況(設有val項以免overfitting)
train_history=model.fit(x=train_feature, y=train_label,
                        validation_split=0.2, epochs=20, batch_size=20, verbose=2) #verbose=2表示顯示訓練過程


######################### 訓練過程視覺化
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')

#儲存訓練結果
model.save_weights("Savemodel/Titanic(Kaggles)_MLP.h5")
print('model saved to disk')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_24 (Dense)             (None, 200)               2000      
_________________________________________________________________
dropout_17 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_18 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 101       
=================================================================
Total params: 22,201
Trainable params: 22,201
Non-trainable params: 0
_________________________________________________________________
None
Train on 712 samples, validate on 179 samples
Epoch 1/20
0s - loss: 0.6652 - acc: 0.6053 - val_loss: 0.5932 - val_acc: 0.6425
Epoch 2/20
0s - loss: 0.5942 - acc: 0.6770 - val_loss: 0.5142 - val_acc: 0.8324
Epoch 3/20
0s - loss: 0.5319 - acc: 0.7612 - val_loss: 0.4554 - val_acc: 0.8045
Epoch 4/20
0s - loss: 0.5035 - acc: 0.7837 - val_loss: 0.4366 - val_acc: 0.8212
Epoch 5/20
0s - loss: 0.4837 - acc: 0.8006 - val_loss: 0.4224 - val_acc: 0.8212
Epoch 6/20
0s - loss: 0.4718 - acc: 0.7935 - val_loss: 0.4152 - val_acc: 0.8212
Epoch 7/20
0s - loss: 0.4666 - acc: 0.7963 - val_loss: 0.4118 - val_acc: 0.8324
Epoch 8/20
0s - loss: 0.4663 - acc: 0.7992 - val_loss: 0.4091 - val_acc: 0.8436
Epoch 9/20
0s - loss: 0.4644 - acc: 0.7992 - val_loss: 0.4052 - val_acc: 0.8324
Epoch 10/20
0s - loss: 0.4602 - acc: 0.7992 - val_loss: 0.3992 - val_acc: 0.8380
Epoch 11/20
0s - loss: 0.4596 - acc: 0.7992 - val_loss: 0.4001 - val_acc: 0.8380
Epoch 12/20
0s - loss: 0.4526 - acc: 0.8132 - val_loss: 0.3954 - val_acc: 0.8436
Epoch 13/20
0s - loss: 0.4645 - acc: 0.8076 - val_loss: 0.3925 - val_acc: 0.8324
Epoch 14/20
0s - loss: 0.4510 - acc: 0.7921 - val_loss: 0.3905 - val_acc: 0.8436
Epoch 15/20
0s - loss: 0.4504 - acc: 0.8090 - val_loss: 0.4008 - val_acc: 0.8324
Epoch 16/20
0s - loss: 0.4628 - acc: 0.8048 - val_loss: 0.3884 - val_acc: 0.8436
Epoch 17/20
0s - loss: 0.4402 - acc: 0.8230 - val_loss: 0.3892 - val_acc: 0.8380
Epoch 18/20
0s - loss: 0.4496 - acc: 0.8048 - val_loss: 0.3866 - val_acc: 0.8436
Epoch 19/20
0s - loss: 0.4535 - acc: 0.8020 - val_loss: 0.3865 - val_acc: 0.8436
Epoch 20/20
0s - loss: 0.4393 - acc: 0.8118 - val_loss: 0.3852 - val_acc: 0.8436
model saved to disk

test_data資料預處理,完畢後就可以正式考試了


In [105]:
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'test.csv'

df2 = pd.read_csv(os.path.join(filepath+filename))

df2.head()


Out[105]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

In [107]:
cols = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
df2 = df2[cols]
df2['Age'] = df2['Age'].fillna(df2['Age'].mean())
df2['Sex']= df2['Sex'].map({'female':0, 'male': 1}).astype(int)
df2 = pd.get_dummies(data=df2, columns=["Embarked"])
df2.head()


//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[107]:
Pclass Sex Age SibSp Parch Fare Embarked_C Embarked_Q Embarked_S
0 3 1 34.5 0 0 7.8292 0 1 0
1 3 0 47.0 1 0 7.0000 0 0 1
2 2 1 62.0 0 0 9.6875 0 1 0
3 3 1 27.0 0 0 8.6625 0 0 1
4 3 0 22.0 1 1 12.2875 0 0 1

In [109]:
df2.isnull().sum()


Out[109]:
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          1
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [110]:
df2['Fare'] = df2['Fare'].fillna(df2['Fare'].mean())

In [111]:
df2.isnull().sum()


Out[111]:
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [112]:
test_feature_raw = df2.values

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
test_feature = minmax_scale.fit_transform(test_feature_raw)

正式考試預測


In [113]:
######################### 紀錄模型預測情形(答案卷)
prediction = model.predict_classes(test_feature)


 32/418 [=>............................] - ETA: 2s

In [115]:
prediction[0:5]


Out[115]:
array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int32)

In [123]:
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'prediction.csv'

df3['Survived'] = pd.DataFrame(prediction)

df3.to_csv(os.path.join(filepath+filename))

In [124]:
df3.tail()


Out[124]:
0 Survived
413 0 0
414 1 1
415 0 0
416 0 0
417 0 0

In [ ]: