In [77]:
import os
import numpy as np
import pandas as pd

# Location of the Kaggle Titanic training data.
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'train.csv'
# Bug fix: pass the directory and file name as SEPARATE arguments so
# os.path.join can insert the separator; the original concatenated them
# first, which reduced the call to a no-op around a manual concatenation.
df = pd.read_csv(os.path.join(filepath, filename))
df
Out[77]:
In [78]:
# Count missing values per column to decide what needs imputation.
df.isnull().sum()
Out[78]:
In [79]:
# Restrict the frame to the label ('Survived') plus the candidate features.
cols = [
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
]
df = df[cols]
df.head()
Out[79]:
In [80]:
# Impute missing ages with the column mean, encode Sex numerically
# (female -> 0, male -> 1), and one-hot encode the port of embarkation.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
df = pd.get_dummies(data=df, columns=["Embarked"])
In [81]:
# Inspect the frame's dimensions after one-hot encoding Embarked.
df.shape
Out[81]:
In [82]:
# Re-check missing values after imputation and encoding.
df.isnull().sum()
Out[82]:
告一段落,準備要製作feature和label
In [86]:
# Dump the cleaned frame into a plain NumPy array
# (column 0 holds the Survived label, the rest are features).
all_array = np.asarray(df)
all_array
Out[86]:
In [92]:
# Split the array: column 0 is the Survived label, columns 1+ are features.
train_label = all_array[:, 0]
train_feature_raw = all_array[:, 1:]
In [93]:
# Peek at the first few raw feature rows ...
train_feature_raw[0:5]
Out[93]:
In [94]:
# ... and the corresponding labels.
train_label[0:5]
Out[94]:
In [96]:
from sklearn import preprocessing

# Rescale every feature column to [0, 1]. The fitted scaler is kept so the
# identical transform can later be applied to the test set.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scale.fit(train_feature_raw)
train_feature = minmax_scale.transform(train_feature_raw)
train_feature[0:5]
Out[96]:
In [ ]:
def Preprocessingdata(raw_data):
    """Load a Titanic CSV and return (scaled feature array, label array).

    Parameters
    ----------
    raw_data : str or file-like
        Path to a CSV with the Kaggle Titanic training schema; it must
        contain a 'Survived' column plus the feature columns listed below.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Min-max scaled features (columns after 'Survived') and the labels.
    """
    df = pd.read_csv(raw_data)
    cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    df = df[cols]
    # Impute numeric gaps with column means. The Fare imputation is a no-op
    # when Fare has no missing values, but prevents NaNs from reaching the
    # scaler when this helper is reused on test-style data.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
    df = pd.get_dummies(data=df, columns=["Embarked"])
    all_array = df.values
    train_feature_raw = all_array[:, 1:]
    train_label = all_array[:, 0]
    # NOTE(review): the scaler is fitted on whatever file is passed in; for a
    # held-out test set the training-set scaler should be reused instead.
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    train_feature = minmax_scale.fit_transform(train_feature_raw)
    return train_feature, train_label


filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'train.csv'
# Pass directory and file name separately so os.path.join does its job.
raw_data = os.path.join(filepath, filename)
In [104]:
import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart per epoch.

    `train` and `validation` are key names into `train_history.history`
    (e.g. 'loss' / 'val_loss').
    """
    for key in (train, validation):
        plt.plot(train_history.history[key])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
######################### Build the model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

model = Sequential()  # layers are stacked strictly in order

# Input layer + hidden layer 1 (the 9 scaled feature columns feed in here).
model.add(Dense(units=200,
                input_dim=9,
                kernel_initializer='uniform',
                activation='relu'))
# Dropout to reduce overfitting.
model.add(Dropout(0.5))
# Hidden layer 2; input size is inferred from the previous layer's units.
model.add(Dense(units=100,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
# Output layer: a single sigmoid unit emitting the binary Survived score.
model.add(Dense(units=1,
                kernel_initializer='uniform',
                activation='sigmoid'))
print(model.summary())  # layer/parameter overview

######################### Train the model
# Binary classification -> binary cross-entropy; adam controls step size.
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
# Train with a 20% validation split to watch for overfitting;
# verbose=2 prints one summary line per epoch.
train_history = model.fit(x=train_feature, y=train_label,
                          validation_split=0.2, epochs=20, batch_size=20, verbose=2)

######################### Visualize the training history
# Bug fix: newer Keras records the metric under 'accuracy'/'val_accuracy'
# rather than 'acc'/'val_acc'; pick whichever key is actually present.
acc_key = 'acc' if 'acc' in train_history.history else 'accuracy'
show_train_history(train_history, acc_key, 'val_' + acc_key)
show_train_history(train_history, 'loss', 'val_loss')

# Persist the trained weights.
# Bug fix: create the target directory first so save_weights cannot fail
# with a missing-directory error on a fresh checkout.
os.makedirs('Savemodel', exist_ok=True)
model.save_weights("Savemodel/Titanic(Kaggles)_MLP.h5")
print('model saved to disk')
In [105]:
# Load the Kaggle Titanic test set.
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'test.csv'
# Bug fix: pass directory and file name as separate os.path.join arguments
# instead of pre-concatenating them into a single argument.
df2 = pd.read_csv(os.path.join(filepath, filename))
df2.head()
Out[105]:
In [107]:
# Apply the same cleaning as on the training set
# (the test set has no 'Survived' column).
cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
df2 = df2[cols]
mean_age = df2['Age'].mean()
df2['Age'] = df2['Age'].fillna(mean_age)
df2['Sex'] = df2['Sex'].map({'female': 0, 'male': 1}).astype(int)
df2 = pd.get_dummies(data=df2, columns=["Embarked"])
df2.head()
Out[107]:
In [109]:
# Check the test set for remaining missing values.
df2.isnull().sum()
Out[109]:
In [110]:
# Impute any missing Fare values with the column mean.
df2['Fare'] = df2['Fare'].fillna(df2['Fare'].mean())
In [111]:
# Confirm every column is now free of missing values.
df2.isnull().sum()
Out[111]:
In [112]:
test_feature_raw = df2.values
# Bug fix: reuse the MinMaxScaler that was fitted on the training features
# instead of fitting a fresh scaler on the test set. Fitting on the test
# set scales train and test inconsistently, so the model would see inputs
# on a different scale than it was trained on.
# NOTE(review): assumes df2's columns line up exactly with the training
# feature columns (including the three Embarked dummies) — verify.
test_feature = minmax_scale.transform(test_feature_raw)
In [113]:
######################### Record the model's predictions (the answer sheet)
# Bug fix: Sequential.predict_classes was deprecated and later removed from
# Keras; threshold the sigmoid output at 0.5 instead, which yields the same
# 0/1 class array for this binary model.
prediction = (model.predict(test_feature) > 0.5).astype('int32')
In [115]:
# Preview the first five predicted classes.
prediction[0:5]
Out[115]:
In [123]:
# Write the Kaggle submission file.
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename = 'prediction.csv'
# Bug fix: df3 was never defined anywhere above, so this cell raised a
# NameError on a fresh kernel. Build the submission frame explicitly from
# the test set's PassengerId column plus the predicted classes, and omit
# the positional index (Kaggle expects exactly PassengerId,Survived).
df3 = pd.read_csv(os.path.join(filepath, 'test.csv'))[['PassengerId']].copy()
df3['Survived'] = prediction.ravel()
df3.to_csv(os.path.join(filepath, filename), index=False)
In [124]:
# Sanity-check the tail of the submission frame before uploading.
df3.tail()
Out[124]:
In [ ]: