In [171]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\
,fbeta_score,classification_report,confusion_matrix,precision_recall_curve,roc_auc_score\
,roc_curve
In [132]:
import os
# NOTE(review): numpy/pandas are already imported in the first cell; the
# duplicate imports were removed so dependencies live in one place.
# TODO: avoid the hardcoded absolute local path -- make the data dir configurable.
filepath = '/Users/mac/Desktop/Kaggle_datasets/Creditcardfraud/'
filename01 = 'creditcard.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
In [133]:
df_full.head()  # quick look at the first few rows of the raw frame
Out[133]:
In [134]:
df_full.info()  # column dtypes and non-null counts
In [135]:
df_full.Class.value_counts() ##Positive cases are extremely rare (~0.2%); training a deep model on this directly risks it predicting all-negative and never detecting a positive
Out[135]:
In [136]:
# Sort descending by Class so all fraud rows (Class == 1) come first, then drop
# the 'Time' column. Reassignment instead of inplace=True keeps the cell
# idempotent-friendly and avoids hidden-state surprises on notebook re-runs.
df_full = df_full.sort_values(by='Class', ascending=False).drop(columns='Time')
In [137]:
df_full.head() ##Note: after the sort, every row from positional index 492 onward is Class 0 (non-fraud), which makes the stratified-style sampling below easy
Out[137]:
In [138]:
# Undersample the majority class: keep only the first 3000 rows of the sorted
# frame, so all fraud rows (sorted to the top) are included.
df_sample = df_full.iloc[:3000,:]
df_sample.Class.value_counts()
Out[138]:
In [139]:
# Feature matrix = first 29 columns, label vector = last column (Class).
# NOTE(review): `feature` and `label` appear unused in the rest of this
# notebook -- the train/test arrays below are rebuilt from df_train/df_test.
feature = np.array(df_sample.values[:,0:29])
label = np.array(df_sample.values[:,-1])
In [140]:
# Reproducibly shuffle the sample, then split positionally:
# first 2400 rows -> training set, remaining 600 -> test set.
from sklearn.utils import shuffle
shuffled = shuffle(df_sample, random_state=42)
df_train = shuffled.iloc[:2400]
df_test = shuffled.iloc[2400:]
In [141]:
# Convert each split to numpy: the first 29 columns are the features,
# the last column (Class) is the label.
train_feature = df_train.iloc[:, 0:29].to_numpy()
train_label = df_train.iloc[:, -1].to_numpy()
test_feature = df_test.iloc[:, 0:29].to_numpy()
test_label = df_test.iloc[:, -1].to_numpy()
In [142]:
train_feature[0]  # inspect one raw (not yet scaled) feature row
Out[142]:
In [143]:
train_feature.shape  # expect (2400, 29)
Out[143]:
In [144]:
train_label  # inspect the training labels
Out[144]:
In [145]:
train_label.shape  # expect (2400,)
Out[145]:
In [146]:
test_feature.shape  # expect (600, 29)
Out[146]:
In [147]:
test_label.shape  # expect (600,)
Out[147]:
In [148]:
# Scale features to [0, 1]. Fit on the training set only, then apply the same
# transform to the test set (no test-set leakage into the scaler statistics).
# NOTE(review): MinMaxScaler is already imported in the first cell, and
# fit_transform replaces the separate fit + transform calls.
scaler = MinMaxScaler()
train_feature_trans = scaler.fit_transform(train_feature)
test_feature_trans = scaler.transform(test_feature)
In [149]:
train_feature_trans  # features now scaled into [0, 1]
Out[149]:
In [150]:
######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    """Plot one training-history metric for the train and validation sets.

    Parameters
    ----------
    train_history : History object returned by keras `model.fit()`.
    train : str, history key of the training metric (e.g. 'loss', 'acc').
    validation : str, history key of the validation metric (e.g. 'val_acc').
    """
    history = train_history.history

    def _resolve(key):
        # Robustness fix: standalone Keras logs 'acc'/'val_acc' while
        # tf.keras >= 2.x logs 'accuracy'/'val_accuracy'. Accept either.
        if key in history:
            return key
        if 'accuracy' in key:
            alias = key.replace('accuracy', 'acc')
        else:
            alias = key.replace('acc', 'accuracy')
        return alias if alias in history else key

    plt.plot(history[_resolve(train)])
    plt.plot(history[_resolve(validation)])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
######################### Build the model
model = Sequential()  # layers are stacked sequentially, in order
# Input layer + hidden layer 1
model.add(Dense(units=200,
                input_dim=29,  # 29 input features
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
# Hidden layer 2 -- no input_dim needed; it is inferred from the previous layer
model.add(Dense(units=200,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
# Output layer: a single sigmoid unit emitting the fraud probability
model.add(Dense(units=1,
                kernel_initializer='uniform',
                activation='sigmoid'))
print(model.summary())  # show the architecture and parameter counts
model.compile(loss='binary_crossentropy',  # binary target -> binary cross-entropy
              optimizer='adam', metrics=['accuracy'])
# BUG FIX: validation_split was 0.8, which held out 80% of the 2400 training
# rows and trained on only 480 samples -- fewer than batch_size=500. 0.2 is
# the conventional hold-out fraction (the manual split above is the test set;
# Keras carves the validation set out of the training data itself).
train_history = model.fit(x=train_feature_trans, y=train_label,
                          validation_split=0.2, epochs=200,
                          batch_size=500, verbose=2)  # verbose=2: one line per epoch
######################### Visualize the training history
show_train_history(train_history, 'acc', 'val_acc')
show_train_history(train_history, 'loss', 'val_loss')
######################### Score on the held-out test set
scores = model.evaluate(test_feature_trans, test_label)
print('\n')
print('accuracy=', scores[1])
######################### Record model predictions (the "answer sheet")
# predict_classes() was removed in TF 2.6; thresholding the sigmoid output at
# 0.5 reproduces it exactly for a 1-unit binary model.
prediction = (model.predict(test_feature_trans) > 0.5).astype('int32')
# Save the trained weights
#model.save_weights("Keras_CreditCardFraud_MLP.h5")
#print('model saved to disk')
In [151]:
# Put test-set ground truth and model predictions side by side for inspection.
df_ans = pd.DataFrame({'Real Class' :test_label})
df_ans['Prediction'] = prediction
In [152]:
df_ans[ df_ans['Real Class'] != df_ans['Prediction'] ]  # misclassified test cases
Out[152]:
In [153]:
df_ans['Prediction'].value_counts() #After the stratified-style sampling the predictions contain a healthy mix of 0s and 1s rather than collapsing to all-0 -- a good sign
Out[153]:
In [154]:
df_ans['Real Class'].value_counts()  # true class balance of the test set
Out[154]:
In [155]:
# predict_classes() was removed in TF 2.6; thresholding the sigmoid output at
# 0.5 reproduces it exactly for a 1-unit binary model.
prediction_train = (model.predict(train_feature_trans) > 0.5).astype('int32')
In [156]:
# Same ground-truth vs. prediction table, for the training set.
df_train_ans = pd.DataFrame({'Real Class' :train_label})
df_train_ans['Prediction'] = prediction_train
In [157]:
df_train_ans[ df_train_ans['Real Class'] != df_train_ans['Prediction'] ]  # misclassified training cases
Out[157]:
In [158]:
df_train_ans['Prediction'].value_counts()  # predicted class counts on the training set
Out[158]:
In [159]:
df_train_ans['Real Class'].value_counts()  # true class counts on the training set
Out[159]:
In [160]:
import seaborn as sns
%matplotlib inline
cols = ['Real_Class_1','Real_Class_0'] #Gold standard
rows = ['Prediction_1','Prediction_0'] #diagnostic tool (our prediction)
B1P1 = len(df_ans[(df_ans['Prediction'] == df_ans['Real Class']) & (df_ans['Real Class'] == 1)])
B1P0 = len(df_ans[(df_ans['Prediction'] != df_ans['Real Class']) & (df_ans['Real Class'] == 1)])
B0P1 = len(df_ans[(df_ans['Prediction'] != df_ans['Real Class']) & (df_ans['Real Class'] == 0)])
B0P0 = len(df_ans[(df_ans['Prediction'] == df_ans['Real Class']) & (df_ans['Real Class'] == 0)])
conf = np.array([[B1P1,B0P1],[B1P0,B0P0]])
df_cm = pd.DataFrame(conf, columns = [i for i in cols], index = [i for i in rows])
f, ax= plt.subplots(figsize = (5, 5))
sns.heatmap(df_cm, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top') #Making x label be on top is common in textbooks.
print('total test case number: ', np.sum(conf))
In [161]:
def model_efficacy(conf):
    """Compute diagnostic metrics from the hand-built confusion matrix above.

    Expected layout (built in the previous cell):
        rows    = prediction (row 0 -> predicted 1, row 1 -> predicted 0)
        columns = gold standard (col 0 -> real 1, col 1 -> real 0)
    i.e. TP = conf[0][0], FP = conf[0][1], FN = conf[1][0], TN = conf[1][1].

    Returns (total_num, sensitivity, specificity, FPR, FNR).
    """
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])  # TP / (TP + FN)
    # BUG FIX: specificity is TN / (TN + FP); the original divided by (FN + TN),
    # which also broke the identity specificity == 1 - false_positive_rate.
    spe = conf[1][1] / (conf[0][1] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])  # FP / (FP + TN)
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])  # FN / (TP + FN)
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
model_efficacy(conf)  # report metrics for the hand-built test-set confusion matrix
Out[161]:
In [220]:
# Evaluate over the entire dataset. The rows beyond the 3000-row sample are all
# Class 0, and we are only predicting (not training), so no shuffle is needed.
df_sample2 = df_full.iloc[:,:]
feature2 = np.array(df_sample2.values[:,0:29])
label2 = np.array(df_sample2.values[:,-1])
feature2_trans = scaler.transform(feature2)  # reuse the scaler fitted on the training set
######################### Score on the full dataset
scores = model.evaluate(feature2_trans, label2)
print('\n')
print('accuracy=',scores[1])
######################### Record model predictions (the "answer sheet")
# predict_classes() was removed in TF 2.6; thresholding the sigmoid output at
# 0.5 reproduces it exactly for a 1-unit binary model.
prediction2 = (model.predict(feature2_trans) > 0.5).astype('int32')
In [221]:
# Count how many of each class the model predicted over the full dataset.
unique, counts = np.unique(prediction2, return_counts=True)
In [222]:
unique, counts ##the results look quite good!!
Out[222]:
In [223]:
prediction2_list = prediction2.reshape(-1).astype(int)  # flatten predictions to a 1-D int array for sklearn metrics
In [224]:
label2_list = label2.astype(int)  # cast float labels to int to match the predictions
In [225]:
# Per-class precision/recall/F1 plus the raw confusion matrix, full dataset.
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
In [226]:
# NOTE: sklearn's confusion_matrix is laid out rows = true class,
# columns = predicted class (labels sorted ascending: 0, 1) -- the opposite
# convention of the hand-built matrix earlier in this notebook.
conf = confusion_matrix(label2_list, prediction2_list)
f, ax= plt.subplots(figsize = (5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top') #Making x label be on top is common in textbooks.
In [227]:
def model_efficacy(conf):
    """Compute diagnostic metrics from a scikit-learn confusion matrix.

    sklearn's confusion_matrix returns C[i, j] = number of samples whose true
    label is i and predicted label is j, labels sorted ascending (0 = normal,
    1 = fraud). Hence:
        TN = conf[0][0], FP = conf[0][1], FN = conf[1][0], TP = conf[1][1]

    BUG FIX: this (duplicate) definition previously indexed conf with the
    hand-built layout from earlier in the notebook (rows = prediction,
    columns = gold standard), so when applied to sklearn's output every
    metric and printed label was wrong (e.g. "sensitivity" was TN/(TN+FN)).

    Returns (total_num, sensitivity, specificity, FPR, FNR).
    """
    total_num = np.sum(conf)
    tn, fp = conf[0][0], conf[0][1]
    fn, tp = conf[1][0], conf[1][1]
    sen = tp / (tp + fn)                   # sensitivity / recall
    spe = tn / (tn + fp)                   # specificity
    false_positive_rate = fp / (fp + tn)   # = 1 - specificity
    false_negative_rate = fn / (fn + tp)   # = 1 - sensitivity
    print('total_num: ', total_num)
    print('G1P1: ', tp)  # G = gold standard; P = prediction
    print('G0P1: ', fp)
    print('G1P0: ', fn)
    print('G0P0: ', tn)
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
model_efficacy(conf)  # report metrics for the sklearn confusion matrix over the full dataset
Out[227]: