Fraud detection is one of the earliest industrial applications of data mining and machine learning. Fraud detection is typically handled as a binary classification problem, but the class population is unbalanced because instances of fraud are usually very rare compared to the overall volume of transactions. Moreover, when fraudulent transactions are discovered, the business typically takes measures to block the accounts from transacting to prevent further losses.
In [1]:
import pandas as pd
import zipfile
# Load the fraud dataset directly from the zipped CSV; the first CSV column
# becomes the DataFrame index.
with zipfile.ZipFile('../datasets/fraud_detection.csv.zip', 'r') as z:
    # Open the archive member in its own context so the handle is closed
    # as soon as parsing finishes.
    with z.open('15_fraud_detection.csv') as f:
        # pd.read_csv is the public entry point; comma is its default separator.
        data = pd.read_csv(f, index_col=0)

# Preview the first rows of the loaded frame.
data.head()
Out[1]:
In [2]:
# Separate the target column from the remaining feature columns.
y = data['Label']
X = data.drop(columns=['Label'])

# Relative frequency of each class in the target.
y.value_counts(normalize=True)
Out[2]:
In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
In [11]:
from sklearn.model_selection import train_test_split

# Candidate classifiers, keyed by the short name later used to label
# prediction columns and result rows.
models = {
    'lr': LogisticRegression(),
    'dt': DecisionTreeClassifier(),
    'nb': GaussianNB(),
    'nn': KNeighborsClassifier(),
}

# Hold out a test split; the fixed random_state keeps the split repeatable.
# NOTE(review): the intro says fraud cases are rare — consider stratify=y so
# both splits keep the same class ratio; left unchanged here to preserve results.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Train all the models
for estimator in models.values():
    estimator.fit(X_train, y_train)
In [12]:
# predict test for each model
# One column per model (same order as `models`), rows aligned to X_test's index.
y_pred = pd.DataFrame(
    {name: estimator.predict(X_test) for name, estimator in models.items()},
    index=X_test.index,
)

# Show a random sample of 10 rows of predictions (unseeded, so it varies per run).
y_pred.sample(10)
Out[12]:
In [13]:
# Ensemble by averaging the per-model predictions row-wise and thresholding:
# a row becomes 1 only when the mean is strictly above 0.5.
# Assuming 0/1 predictions, this is a strict majority vote, so an even split
# between the models yields 0.
y_pred_ensemble1 = y_pred.mean(axis=1).gt(0.5).astype(int)
In [14]:
# Mean of the 0/1 ensemble predictions, i.e. the share of test rows predicted as 1.
y_pred_ensemble1.mean()
Out[14]:
In [15]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
In [16]:
# Metric functions keyed by the short name used as the results-table column.
stats = dict(
    acc=accuracy_score,
    f1=f1_score,
    rec=recall_score,
    pre=precision_score,
)

# Empty results table: one row per model, one column per metric.
res = pd.DataFrame(columns=list(stats), index=list(models))
In [17]:
# Score each model's test-set predictions with every metric and store the
# value in the matching (model, metric) cell of the results table.
for model_name in models:
    for stat_name, metric in stats.items():
        res.loc[model_name, stat_name] = metric(y_test, y_pred[model_name])
In [18]:
# Display the per-model metrics table.
res
Out[18]:
In [19]:
# Add an 'ensemble1' row (initialised to 0 so the row exists), then overwrite
# each cell with the metric computed on the ensemble's predictions.
res.loc['ensemble1'] = 0
for stat_name, metric in stats.items():
    res.loc['ensemble1', stat_name] = metric(y_test, y_pred_ensemble1)
In [20]:
# Display the metrics table again, now including the 'ensemble1' row.
res
Out[20]:
In [ ]:
In [ ]:
In [ ]: