In [2]:
import os
import numpy as np
import pandas as pd
filepath = '/Users/mac/Desktop/Kaggle_datasets/WSC_breast_cancer_FNA/'
filename = 'data.csv'
dfFull = pd.read_csv(os.path.join(filepath, filename))
dfFull
Out[2]:
In [3]:
dfFull.info()
In [4]:
#dfFull.dropna(axis=1, inplace=True)  # drop the all-NaN 'Unnamed: 32' junk column
In [5]:
dfFull.isnull().sum()  # count missing values per column
Out[5]:
In [6]:
dfFull.describe()
Out[6]:
In [7]:
dfFull['diagnosis'] = dfFull['diagnosis'].map({'M': 1, 'B': 0}).astype(int)
dfFull['diagnosis'].head()  # map the labels to integers (if this errored, the fallback would be one-hot encoding)
Out[7]:
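The comment above mentions one-hot encoding as the fallback. For reference, a minimal sketch of that alternative with pandas, applied to the original 'M'/'B' strings (the diagnosis_B / diagnosis_M column names are whatever get_dummies generates from the prefix):

# one-hot encode the label instead of mapping to 0/1
onehot = pd.get_dummies(dfFull['diagnosis'], prefix='diagnosis')
# for a binary label, keeping a single indicator column is equivalent to the 0/1 mapping above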
In [8]:
dfFull.dropna(axis=1, inplace=True)  # drop the all-NaN 'Unnamed: 32' column for real this time
In [9]:
dfFull.head()
Out[9]:
In [10]:
np.random.seed(42)
dfFull_shuffle = dfFull.iloc[np.random.permutation(len(dfFull))]
dfTrain = dfFull_shuffle.iloc[:454, :]  # iloc[1:454] would have silently dropped the first shuffled row
dfTest = dfFull_shuffle.iloc[454:, :]
# select the 30 feature columns, leaving out id and diagnosis
train_feature = np.array(dfTrain[['radius_mean', 'texture_mean', 'perimeter_mean',
                                  'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
                                  'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
                                  'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
                                  'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
                                  'fractal_dimension_se', 'radius_worst', 'texture_worst',
                                  'perimeter_worst', 'area_worst', 'smoothness_worst',
                                  'compactness_worst', 'concavity_worst', 'concave points_worst',
                                  'symmetry_worst', 'fractal_dimension_worst']])
train_label = np.array(dfTrain['diagnosis'])
# same 30 feature columns for the test set, again leaving out id and diagnosis
test_feature = np.array(dfTest[['radius_mean', 'texture_mean', 'perimeter_mean',
                                'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
                                'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
                                'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
                                'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
                                'fractal_dimension_se', 'radius_worst', 'texture_worst',
                                'perimeter_worst', 'area_worst', 'smoothness_worst',
                                'compactness_worst', 'concavity_worst', 'concave points_worst',
                                'symmetry_worst', 'fractal_dimension_worst']])
test_label = np.array(dfTest['diagnosis'])
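The shuffle-and-slice above does not guarantee the benign/malignant ratio is the same in both slices. A sketch of a stratified alternative using scikit-learn's train_test_split; the 0.2 proportion is illustrative, not the 454/115 slice used above:

# a stratified split keeps the class ratio equal in train and test
from sklearn.model_selection import train_test_split
features = dfFull.drop(columns=['id', 'diagnosis']).values
labels = dfFull['diagnosis'].values
X_train, X_test, y_train, y_test = train_test_split(
    features, labels,
    test_size=0.2,     # illustrative proportion
    stratify=labels,   # preserve the benign/malignant ratio in both parts
    random_state=42)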
In [11]:
len(train_feature)
Out[11]:
In [12]:
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
train_feature = minmax_scale.fit_transform(train_feature)
test_feature = minmax_scale.transform(test_feature)  # transform only: reuse the min/max fitted on the training set
print(train_feature[0])
print(train_label[0])
print(test_feature[0])
print(test_label[0])
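A quick sanity check that the scaler's parameters come from the training set only; data_min_ and data_max_ are attributes scikit-learn's MinMaxScaler exposes after fitting:

# per-feature minima/maxima learned from the training set
print(minmax_scale.data_min_[:5])
print(minmax_scale.data_max_[:5])
# note: scaled test values can fall slightly outside [0, 1] when the test set
# contains values beyond the training min/max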
In [13]:
dfTrain.columns  # list the column names, handy for copy-pasting the feature selection above
Out[13]:
In [14]:
train_feature.shape  # confirm how many input features the model will be fed
Out[14]:
In [57]:
from sklearn.model_selection import train_test_split
# manual validation split (unused below: model.fit's validation_split does this internally)
Newtrain_feature, Val_feature, Newtrain_label, Val_label = train_test_split(train_feature,
                                                                            train_label,
                                                                            test_size=0.2,    # 0.2 validation; 0.8 train
                                                                            random_state=40)  # fixed seed so every run uses the same split
In [15]:
import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
######################### Build the model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
model = Sequential()  # layers are stacked one after another, in order
# input layer (hidden layer 1)
model.add(Dense(units=1000,
                input_dim=30,
                kernel_initializer='uniform',
                activation='relu'))
# dropout to reduce overfitting
model.add(Dropout(0.5))
# hidden layer 2; no input_dim needed, it is inferred from the previous layer's units
model.add(Dense(units=500,
                kernel_initializer='uniform',
                activation='relu'))
# dropout to reduce overfitting
model.add(Dropout(0.5))
# hidden layer 3
model.add(Dense(units=250,
                kernel_initializer='uniform',
                activation='relu'))
# dropout to reduce overfitting
model.add(Dropout(0.5))
# output layer
model.add(Dense(units=1,  # a single sigmoid output for the binary label
                kernel_initializer='uniform',
                activation='sigmoid'))
print(model.summary())  # shows the architecture and the parameter count per layer
######################### Train the model
# choose the loss metric and the optimizer (which controls the learning trajectory and speed)
model.compile(loss='binary_crossentropy',  # binary cross-entropy for a two-class problem
              optimizer='adam', metrics=['accuracy'])
# train and record the history (a validation split guards against overfitting;
# the manual split above is unnecessary since Keras does it here via validation_split)
train_history = model.fit(x=train_feature, y=train_label,
                          validation_split=0.2, epochs=20, batch_size=50, verbose=2)  # verbose=2 prints one line per epoch
######################### Visualize the training history
show_train_history(train_history, 'acc', 'val_acc')
show_train_history(train_history, 'loss', 'val_loss')
# save the trained weights
model.save_weights("Savemodels/BRCA_FNA(Kaggles)_MLP.h5")
print('model saved to disk')
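With dropout alone the network can still overfit within a fixed epoch budget. A minimal sketch of early stopping via a Keras callback; the patience value is an illustrative assumption, not something tuned here:

# stop training once val_loss stops improving
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',  # watch the validation loss
                           patience=5)          # illustrative patience, not tuned
train_history = model.fit(x=train_feature, y=train_label,
                          validation_split=0.2, epochs=100, batch_size=50,
                          verbose=2, callbacks=[early_stop])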
In [16]:
scores = model.evaluate(test_feature, test_label)
print('\n')
print('accuracy=', scores[1])
######################### Record the model's predictions on the test set (the answer sheet)
prediction = model.predict_classes(test_feature)
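A single accuracy number hides the split between false negatives and false positives, which matters for a cancer screen. A short check with scikit-learn's metrics:

# per-class breakdown of the test-set predictions
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(test_label, prediction.ravel()))       # rows: true class, columns: predicted class
print(classification_report(test_label, prediction.ravel()))  # precision / recall / F1 per class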
In [43]:
from collections import Counter
In [17]:
prediction_train = model.predict_classes(train_feature)
In [34]:
df_prediction_train = pd.DataFrame({'prediction': prediction_train.ravel(),
                                    'diagnosis': train_label})
df_prediction_train = df_prediction_train[['prediction', 'diagnosis']]  # fix the column order
In [35]:
df_prediction_train
Out[35]:
In [45]:
df_prediction_train[df_prediction_train['diagnosis'] != df_prediction_train['prediction']]  # training-set misclassifications
Out[45]:
In [51]:
df_prediction_train['diagnosis'].value_counts()
Out[51]:
In [48]:
(df_prediction_train['diagnosis'] == 1).sum()  # len() of a boolean Series just returns the row count, not the number of 1s
Out[48]:
In [18]:
df_prediction = pd.DataFrame({'prediction': prediction.ravel(),
                              'diagnosis': test_label})
In [19]:
df_prediction = df_prediction[['prediction', 'diagnosis']]
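The accuracy reported by model.evaluate can be recomputed directly from this comparison frame:

# fraction of rows where the prediction matches the true diagnosis
(df_prediction['prediction'] == df_prediction['diagnosis']).mean()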
In [33]:
df_prediction
Out[33]:
In [50]:
df_prediction['diagnosis'].value_counts()
Out[50]:
In [39]:
df_prediction[df_prediction['diagnosis'] != df_prediction['prediction']]  # test-set misclassifications
Out[39]:
In [21]:
len(df_prediction)
Out[21]:
In [22]:
plt.hist(dfFull['diagnosis'])
plt.show()
In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [24]:
dfTrain['diagnosis'].value_counts()
Out[24]:
In [25]:
plt.hist(dfTrain['diagnosis'])
Out[25]:
In [26]:
corrmat = dfTrain.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow')
Out[26]:
In [27]:
k = 10  # number of variables for the heatmap
cols = corrmat.nlargest(k, 'diagnosis')['diagnosis'].index
cm = np.corrcoef(dfTrain[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()
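The heatmap shows that the mean/worst variants of each measurement are strongly correlated with one another. A sketch for listing the highly correlated feature pairs programmatically; the 0.9 cutoff is an illustrative assumption:

# list feature pairs whose absolute correlation exceeds a threshold
threshold = 0.9  # illustrative cutoff
high_corr = [(a, b, corrmat.loc[a, b])
             for i, a in enumerate(corrmat.columns)
             for b in corrmat.columns[i + 1:]
             if abs(corrmat.loc[a, b]) > threshold]
for a, b, r in high_corr:
    print('%s ~ %s: %.2f' % (a, b, r))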
In [28]:
# scatterplot of the features most correlated with diagnosis
sns.set()
cols = ['diagnosis', 'concave points_worst', 'concave points_mean', 'concavity_mean', 'concavity_worst', 'perimeter_worst', 'radius_worst']
sns.pairplot(dfTrain[cols], size=2.5)
plt.show()
In [17]:
dfTest['diagnosis'].value_counts()
Out[17]:
In [55]:
plt.hist(dfTest['diagnosis'])
Out[55]:
In [47]:
corrmat = dfTest.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow')
Out[47]:
In [29]:
corrmat = dfTest.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow', mask=corrmat < 0.7)  # mask hides correlations below 0.7
Out[29]:
In [30]:
k = 10  # number of variables for the heatmap
cols = corrmat.nlargest(k, 'diagnosis')['diagnosis'].index
cm = np.corrcoef(dfTest[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()
In [31]:
k = 10  # number of variables for the heatmap
cols = corrmat.nlargest(k, 'diagnosis')['diagnosis'].index
cm = np.corrcoef(dfTest[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, mask=cm < 0.75)  # hide correlations below 0.75
plt.show()
In [32]:
# scatterplot of the same feature subset on the test set
sns.set()
cols = ['diagnosis', 'concave points_worst', 'concave points_mean', 'concavity_mean', 'concavity_worst', 'perimeter_worst', 'radius_worst']
sns.pairplot(dfTest[cols], size=2.5)
plt.show()
In [ ]: