In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# We resort to a third party library to plot silhouette diagrams
! pip install yellowbrick
from yellowbrick.cluster import SilhouetteVisualizer
Goal: Find fraudulent credit card transactions
Dataset:
In [35]:
! wget https://datahub.io/machine-learning/creditcard/r/creditcard.csv
In [36]:
# Read the downloaded CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')
df.head(n=5)
Out[36]:
In [37]:
# Print the DataFrame summary; verbose=True requests the full per-column listing.
df.info(verbose=True)
In [38]:
# Count how many rows carry each value of the Class label column.
df.loc[:, 'Class'].value_counts()
Out[38]:
The anomalies are the minority.
Remove the `Time` column, since it carries no useful information for detecting anomalies in our setting.
In [0]:
# Drop the Time column if it is still present. Because `df` is reassigned to its own
# result, a second run of this cell would otherwise raise KeyError on the column
# that was already removed; errors='ignore' makes the cell safe to re-run.
df = df.drop(columns='Time', errors='ignore')
In unsupervised approaches, the label is not used
In [0]:
# Feature matrix: every column of df except the Class label.
X = df.drop(columns=['Class'])
All the methods we will use, except iForests, perform best if the dataset is scaled
In [0]:
# Fit the standardiser on the features, then apply it to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
In [0]:
# Number of clusters requested from k-means.
K = 3
# Fit k-means on the scaled features (fixed seed for repeatability) and keep
# the cluster id assigned to each sample.
model = KMeans(n_clusters=K, random_state=3).fit(X_scaled)
clusters = model.labels_
The array `clusters` contains the cluster id of each sample.
In [43]:
# Peek at the cluster ids of the first five samples.
clusters[:5]
Out[43]:
Check how many elements per cluster
In [44]:
# Show how many samples fall in each cluster.
# The bin edges sit at -0.5, 0.5, ..., K - 0.5, so every integer cluster id gets
# exactly one bar centred on its id (the default 10 equal-width bins leave empty
# bins between ids and shift the bars away from them).
fig, ax = plt.subplots()
ax.hist(clusters, bins=np.arange(K + 1) - 0.5, rwidth=0.8)
ax.set_xticks(range(K))
ax.set_xlabel('Cluster id')
ax.set_ylabel('Number of samples')
ax.set_title('Samples per cluster')
# plt.show() renders the figure without echoing the raw return value of the last call.
plt.show()
Out[44]:
In [45]:
# Inspired by https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
# A single set of axes that will host the silhouette plot.
fig, ax1 = plt.subplots()
# Silhouette coefficients lie in the interval [-1, 1].
ax1.set_xlim(-1, 1)
# Besides one row per sample, reserve (K + 1) * 10 extra rows so that blank gaps
# can visually separate the silhouette bands of the individual clusters.
ax1.set_ylim(0, X.shape[0] + 10 * (K + 1))
Out[45]:
If you try to compute the silhouette score with ordinary sklearn
functions, it is extremely slow.
Recall that you need to compute the distances between all samples, i.e.
In [46]:
# Number of sample-to-sample distances (number of samples squared), shown in
# scientific notation.
print(
    "Distances to be computed: ",
    "{:e}".format(len(X_scaled) ** 2),
)
We will thus use an alternative implementation by Alexandre Abraham.
In [47]:
! wget https://gist.githubusercontent.com/AlexandreAbraham/5544803/raw/221aa797cdbfa9e9f75fc0aabb2322dcc11c8991/unsupervised_alt.py
import unsupervised_alt
In [0]:
# The silhouette score is the mean silhouette coefficient over the samples; it
# gives a perspective into the density and separation of the formed clusters.
# An exact score needs the distances between all pairs of samples, which is
# impractical on the full dataset, so the score is estimated on a random subset
# through scikit-learn's `sample_size` option. `random_state` fixes the subset,
# which makes the estimate repeatable across runs.
silhouette_avg = silhouette_score(
    X_scaled,
    clusters,
    sample_size=10000,
    random_state=3,
)
In [0]:
# Silhouette coefficient of every individual sample in X_scaled.
# NOTE(review): this calls scikit-learn's implementation on the full dataset;
# confirm that the runtime is acceptable here.
sample_silhouette_values = silhouette_samples(X=X_scaled, labels=clusters)
In [0]: