In [1]:
from __future__ import division

import h5py
import pandas as pd
import numpy as np
np.random.seed(1)

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
plt.style.use("ggplot")
%matplotlib inline

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from keras import backend as K
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping

from IPython.display import clear_output
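This notebook imports the standalone keras package. On a newer, TensorFlow-only setup the same objects live under tensorflow.keras; a minimal equivalent sketch, assuming TensorFlow 2.x is installed:

# Equivalent imports for a TensorFlow 2.x environment (assumption: tf.keras is used instead of standalone keras)
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping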
In [2]:
data = pd.read_csv("data/creditcard.csv")
In [3]:
data.head()
Out[3]:
In [4]:
data.describe()
Out[4]:
In [5]:
data.isnull().sum()
Out[5]:
In [6]:
data.Time[data.Class == 1].describe()
Out[6]:
In [7]:
data.Time[data.Class == 0].describe()
Out[7]:
In [8]:
data.Class.value_counts(normalize=True)
Out[8]:
In [9]:
data.Class.value_counts(normalize=True).plot(kind="bar", logy=True, figsize=(8, 6), fontsize=15)
pass
In [10]:
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 4))
data.Amount[data.Class == 1].hist(ax=ax1, bins=100)
data.Amount[data.Class == 0].hist(ax=ax2, bins=100, log=True)
ax1.set_title("Fraud")
ax2.set_title("Normal")
plt.xlabel("Amount")
plt.ylabel("#Transactions")
pass
In [11]:
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12,4))
data.Time[data.Class == 1].hist(ax=ax1, bins=100)
data.Time[data.Class == 0].hist(ax=ax2, bins=100, log=True)
ax1.set_title("Fraud")
ax2.set_title("Normal")
plt.xlabel("Time (seconds)")
plt.ylabel("#Transactions")
pass
In [12]:
plt.hist(data.V1[data.Class == 1], 50, alpha=0.3, label="Fraud")
plt.hist(data.V1[data.Class == 0], 50, alpha=0.5, label="Normal", log=True)
plt.legend(loc='upper right')
pass
In [13]:
plt.figure(figsize=(15, 40))
gs = gridspec.GridSpec(14, 2, hspace=0.8)
bins = 30
for i, col in enumerate(data.columns[1:29]):
    ax = plt.subplot(gs[i])
    sns.distplot(data[col][data.Class == 1], bins=bins, label="Fraud")
    sns.distplot(data[col][data.Class == 0], bins=bins, label="Normal")
    ax.set_title("Density distribution for {}".format(col))
    ax.legend(loc="upper right")
pass
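sns.distplot is deprecated in seaborn 0.11+; a minimal sketch of the same comparison for a single feature with histplot, assuming a recent seaborn is installed:

# Sketch: distplot replacement on newer seaborn (assumption: seaborn >= 0.11)
sns.histplot(data.V1[data.Class == 1], bins=bins, stat="density", kde=True, label="Fraud")
sns.histplot(data.V1[data.Class == 0], bins=bins, stat="density", kde=True, label="Normal")
plt.legend(loc="upper right")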
In [14]:
EPOCHS = 20         # maximum training epochs per fold
BATCH_SIZE = 32
PATIENCE = 4        # early-stopping patience, in epochs
N_SPLIT_SIZE = 10   # number of stratified CV folds
V_SPLIT_NN = 0.2    # validation_split passed to model.fit
T_SPLIT_RS = 0.2    # test-set fraction (not used in this section)
In [15]:
def evaluation(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    return acc, prec, rec
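A quick sanity check of evaluation() on a toy pair of label vectors (counts worked out by hand):

# 3 of 4 labels correct, no false positives, one missed positive
# -> accuracy 0.75, precision 1.0, recall ~0.667
evaluation(np.array([1, 0, 1, 1]), np.array([1, 0, 0, 1]))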
In [16]:
def neural_network(data):
    model = Sequential()
    model.add(Dense(256, activation="sigmoid", input_dim=data.shape[1]))
    model.add(Dense(128, activation="sigmoid"))
    model.add(Dense(64, activation="sigmoid"))
    model.add(Dense(32, activation="sigmoid"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
In [17]:
def run(model, index, X_train, X_test, y_train, y_test):
    early_stopping = EarlyStopping(monitor="loss", patience=PATIENCE)
    model_callback = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
                               validation_split=V_SPLIT_NN, callbacks=[early_stopping], verbose=1)
    model.save("models/model_{i}.h5".format(i=index))
    y_test_score = model.predict(X_test)
    y_test_pred = model.predict_classes(X_test)
    acc, prec, rec = evaluation(y_test, y_test_pred)
    auc = roc_auc_score(y_test, y_test_score)
    return acc, prec, rec, auc, model_callback.history["loss"], model_callback.history["val_loss"]
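Sequential.predict_classes was removed from the Keras bundled with TensorFlow 2.6+; on such versions the hard labels inside run() can be obtained by thresholding predict instead, a sketch under that assumption:

# For newer Keras/TF without predict_classes: threshold the sigmoid output at 0.5
y_test_score = model.predict(X_test)
y_test_pred = (y_test_score > 0.5).astype("int32")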
In [18]:
# Oversample the minority (fraud) class with SMOTE, using only features V1..V28 (Time and Amount are dropped)
sm = SMOTE(random_state=0)
X, y = data[data.columns[1:29]].values, data.Class.values
X_res, y_res = sm.fit_sample(X, y)
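SMOTE.fit_sample is the older imbalanced-learn spelling; recent releases expose the same method as fit_resample, so on a newer install the call would be:

# Equivalent call on imbalanced-learn >= 0.4
X_res, y_res = sm.fit_resample(X, y)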
In [19]:
pd.Series(y_res).value_counts()
Out[19]:
In [20]:
nn = neural_network(X_res)
In [21]:
nn.summary()
In [22]:
skf = StratifiedKFold(n_splits=N_SPLIT_SIZE, shuffle=True, random_state=0)  # shuffle=True so random_state has an effect
results = []
for index, (train_index, test_index) in enumerate(skf.split(X_res, y_res)):
    clear_output()
    print("Run iteration: {i}".format(i=index))
    X_train, X_test = X_res[train_index], X_res[test_index]
    y_train, y_test = y_res[train_index], y_res[test_index]
    results.append(run(neural_network(X_train), index, X_train, X_test, y_train, y_test))
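One caveat with oversampling before splitting is that synthetic neighbours of test-fold frauds can leak into the training folds. A hedged alternative sketch that resamples only the training portion of each fold (the names skf2, results_no_leak, X_tr, y_tr are illustrative, not from the original run):

# Alternative: fit SMOTE inside the CV loop, on the training fold only
results_no_leak = []
skf2 = StratifiedKFold(n_splits=N_SPLIT_SIZE, shuffle=True, random_state=0)
for index, (train_index, test_index) in enumerate(skf2.split(X, y)):
    X_tr, y_tr = SMOTE(random_state=0).fit_sample(X[train_index], y[train_index])
    results_no_leak.append(run(neural_network(X_tr), index,
                               X_tr, X[test_index], y_tr, y[test_index]))

Note that run() saves each model to models/model_{index}.h5, so this pass would overwrite the files saved above.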
In [38]:
metrics = [i[0:4] for i in results]
metrics_df = pd.DataFrame(metrics, columns=["Accuracy", "Precision", "Recall", "AUC"])
In [24]:
# average the metrics over the N_SPLIT_SIZE CV folds
metrics_df.mean()
Out[24]:
In [52]:
metrics_df.plot(figsize=(10, 8), fontsize=15, yticks=np.arange(0.985, 1, 0.001))
plt.xlabel("Iteration", fontsize=15)
plt.ylabel("Score", fontsize=15)
plt.legend(loc="lower left", fontsize=15)
pass
In [26]:
# save it to csv
pd.DataFrame(results).to_csv("result.csv", header=False)
In [27]:
# training loss history, one series per fold
loss_list = [pd.Series(r[4]) for r in results]
In [28]:
pd.concat(loss_list, axis=1).mean(axis=1).plot(xlim=[0, 20], xticks=range(21), yticks=np.arange(0, 0.13, 0.01),
figsize=(8, 6), fontsize=10)
plt.xlabel("Training Epoch", fontsize=10)
plt.ylabel("Average Loss", fontsize=10)
pass
In [29]:
# validation loss history, one series per fold
loss_list = [pd.Series(r[5]) for r in results]
In [30]:
pd.concat(loss_list, axis=1).mean(axis=1).plot(xlim=[0, 20], xticks=range(21), yticks=np.arange(0, 0.13,0.01),
figsize=(8, 6), fontsize=10)
plt.xlabel("Validation Epoch", fontsize=10)
plt.ylabel("Average Loss", fontsize=10)
pass
In [31]:
# training and validation loss histories, one series per fold
train_loss = [pd.Series(r[4]) for r in results]
val_loss = [pd.Series(r[5]) for r in results]
In [32]:
train_loss_avg = pd.concat(train_loss, axis=1).mean(axis=1)
val_loss_avg = pd.concat(val_loss, axis=1).mean(axis=1)
train_and_val_loss = pd.concat([train_loss_avg, val_loss_avg], axis=1)
train_and_val_loss.rename(index=str, columns={0: "Training Loss", 1: "Validation Loss"}) \
.plot(xlim=[0, 20], xticks=range(21), yticks=np.arange(0, 0.11,0.01),
figsize=(10, 8), fontsize=15)
plt.xlabel("Epoch", fontsize=15)
plt.ylabel("Average Loss", fontsize=15)
plt.legend(loc="upper right", fontsize=15)
pass
In [33]:
metrics_df
Out[33]:
In [34]:
# index of the CV fold with the highest recall
np.argmax(np.array(metrics)[:, 2])
Out[34]:
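The same fold can be read straight off metrics_df, which keeps the column name visible:

# index of the CV fold with the highest recall (same result as the argmax above)
metrics_df["Recall"].idxmax()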
In [35]:
model = load_model("models/model_0.h5")
In [36]:
y_pred = model.predict_classes(X)
In [37]:
acc = accuracy_score(y, y_pred)
prec = precision_score(y, y_pred)
rec = recall_score(y, y_pred)
print(acc, prec, rec)
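Because the original data is so heavily imbalanced, the raw error counts are also informative; a minimal follow-up using scikit-learn's confusion_matrix:

# Confusion matrix on the original (imbalanced) data: rows = true class, columns = predicted class
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_pred))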