In [1]:
import h5py
import pandas as pd
import numpy as np
np.random.seed(1)

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
plt.style.use("ggplot")
%matplotlib inline

from __future__ import division
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from keras import backend as K
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping
from IPython.display import clear_output


Using TensorFlow backend.

In [2]:
data = pd.read_csv("data/creditcard.csv")

In [3]:
EPOCHS = 20
BATCH_SIZE = 32
PATIENCE = 4
N_SPLIT_SIZE = 10
V_SPLIT_NN = 0.2
T_SPLIT_RS = 0.2

In [4]:
def evaluation(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    return acc, prec, rec

In [5]:
def neural_network(data):
    model = Sequential()
    model.add(Dense(256, activation="sigmoid", input_dim=data.shape[1]))
    model.add(Dense(128, activation="sigmoid"))
    model.add(Dense(64, activation="sigmoid"))
    model.add(Dense(32, activation="sigmoid"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model

In [6]:
def run(model, X_train, X_test, y_train, y_test):
    early_stopping = EarlyStopping(monitor="loss", patience=PATIENCE)
    model_callback = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, 
                               validation_split=V_SPLIT_NN, callbacks=[early_stopping], verbose=1)
    y_test_score = model.predict(X_test)
    y_test_pred = model.predict_classes(X_test)
    acc, prec, rec = evaluation(y_test, y_test_pred)
    auc = roc_auc_score(y_test, y_test_score)
    return acc, prec, rec, auc, model_callback.history["loss"], model_callback.history["val_loss"]

In [7]:
# Oversample
sm = SMOTE(random_state=0)
X, y = data[data.columns[1:29]].values, data.Class.values
X_res, y_res = sm.fit_sample(X, y)

In [8]:
nn = neural_network(X_res)

In [9]:
nn.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 256)               7424      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
=================================================================
Total params: 50,689
Trainable params: 50,689
Non-trainable params: 0
_________________________________________________________________

In [10]:
kf = KFold(n_splits=N_SPLIT_SIZE, random_state=0)

results = []
for index, (train_index, test_index) in enumerate(kf.split(X_res)):
    clear_output()
    print("Run iteration: {i}".format(i=index))
    X_train, X_test = X_res[train_index], X_res[test_index]
    y_train, y_test = y_res[train_index], y_res[test_index]
    results.append(run(neural_network(X_train), X_train, X_test, y_train, y_test))


Run iteration: 6
Train on 409413 samples, validate on 102354 samples
Epoch 1/20
409413/409413 [==============================] - 33s - loss: 0.0862 - acc: 0.9689 - val_loss: 0.0970 - val_acc: 0.9591
Epoch 2/20
409413/409413 [==============================] - 33s - loss: 0.0348 - acc: 0.9883 - val_loss: 0.0251 - val_acc: 0.9913
Epoch 3/20
409413/409413 [==============================] - 33s - loss: 0.0215 - acc: 0.9938 - val_loss: 0.0089 - val_acc: 0.9972
Epoch 4/20
409413/409413 [==============================] - 33s - loss: 0.0168 - acc: 0.9958 - val_loss: 0.0409 - val_acc: 0.9879
Epoch 5/20
409413/409413 [==============================] - 33s - loss: 0.0145 - acc: 0.9965 - val_loss: 6.0314e-04 - val_acc: 0.9998
Epoch 6/20
409413/409413 [==============================] - 33s - loss: 0.0135 - acc: 0.9968 - val_loss: 0.0120 - val_acc: 0.9958
Epoch 7/20
409413/409413 [==============================] - 33s - loss: 0.0125 - acc: 0.9970 - val_loss: 0.0021 - val_acc: 0.9994
Epoch 8/20
409413/409413 [==============================] - 34s - loss: 0.0117 - acc: 0.9973 - val_loss: 0.0023 - val_acc: 0.9993
Epoch 9/20
409413/409413 [==============================] - 33s - loss: 0.0113 - acc: 0.9976 - val_loss: 0.0029 - val_acc: 0.9995
Epoch 10/20
409413/409413 [==============================] - 33s - loss: 0.0107 - acc: 0.9977 - val_loss: 0.0133 - val_acc: 0.9964
Epoch 11/20
409413/409413 [==============================] - 34s - loss: 0.0102 - acc: 0.9980 - val_loss: 0.0027 - val_acc: 0.9992
Epoch 12/20
409413/409413 [==============================] - 33s - loss: 0.0100 - acc: 0.9980 - val_loss: 0.0011 - val_acc: 0.9996
Epoch 13/20
409413/409413 [==============================] - 33s - loss: 0.0096 - acc: 0.9982 - val_loss: 0.0039 - val_acc: 0.9987
Epoch 14/20
409413/409413 [==============================] - 33s - loss: 0.0096 - acc: 0.9981 - val_loss: 0.0019 - val_acc: 0.9994
Epoch 15/20
409413/409413 [==============================] - 34s - loss: 0.0092 - acc: 0.9982 - val_loss: 0.0039 - val_acc: 0.9987
Epoch 16/20
409413/409413 [==============================] - 34s - loss: 0.0093 - acc: 0.9982 - val_loss: 7.0225e-04 - val_acc: 0.9999
Epoch 17/20
409413/409413 [==============================] - 34s - loss: 0.0095 - acc: 0.9983 - val_loss: 5.4792e-04 - val_acc: 0.9999
Epoch 18/20
409413/409413 [==============================] - 35s - loss: 0.0099 - acc: 0.9982 - val_loss: 0.0052 - val_acc: 0.9983
Epoch 19/20
409413/409413 [==============================] - 34s - loss: 0.0093 - acc: 0.9984 - val_loss: 0.0035 - val_acc: 0.9984
Epoch 20/20
409413/409413 [==============================] - 35s - loss: 0.0098 - acc: 0.9984 - val_loss: 1.4852e-04 - val_acc: 0.9999
56416/56863 [============================>.] - ETA: 0s
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-f80cb3539263> in <module>()
      7     X_train, X_test = X_res[train_index], X_res[test_index]
      8     y_train, y_test = y_res[train_index], y_res[test_index]
----> 9     results.append(run(neural_network(X_train), X_train, X_test, y_train, y_test))

<ipython-input-6-bacca2a1248e> in run(model, X_train, X_test, y_train, y_test)
      6     y_test_pred = model.predict_classes(X_test)
      7     acc, prec, rec = evaluation(y_test, y_test_pred)
----> 8     auc = roc_auc_score(y_test, y_test_score)
      9     return acc, prec, rec, auc, model_callback.history["loss"], model_callback.history["val_loss"]

/Users/datitran/anaconda/envs/fraud-detection/lib/python2.7/site-packages/sklearn/metrics/ranking.pyc in roc_auc_score(y_true, y_score, average, sample_weight)
    258     return _average_binary_score(
    259         _binary_roc_auc_score, y_true, y_score, average,
--> 260         sample_weight=sample_weight)
    261 
    262 

/Users/datitran/anaconda/envs/fraud-detection/lib/python2.7/site-packages/sklearn/metrics/base.pyc in _average_binary_score(binary_metric, y_true, y_score, average, sample_weight)
     82 
     83     if y_type == "binary":
---> 84         return binary_metric(y_true, y_score, sample_weight=sample_weight)
     85 
     86     check_consistent_length(y_true, y_score, sample_weight)

/Users/datitran/anaconda/envs/fraud-detection/lib/python2.7/site-packages/sklearn/metrics/ranking.pyc in _binary_roc_auc_score(y_true, y_score, sample_weight)
    249     def _binary_roc_auc_score(y_true, y_score, sample_weight=None):
    250         if len(np.unique(y_true)) != 2:
--> 251             raise ValueError("Only one class present in y_true. ROC AUC score "
    252                              "is not defined in that case.")
    253 

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: