In [1]:
from preamble import *
%matplotlib notebook
In [2]:
# Read the CSV at data/german_cc_fraud.csv (path relative to the working
# directory) into the DataFrame `data`.
# NOTE(review): `pd` is not imported in any visible cell; presumably it is
# provided by `from preamble import *` — confirm.
data = pd.read_csv("data/german_cc_fraud.csv")
In [3]:
# Count the rows per distinct value of the 'class' column; this column is used
# as the ground truth (== "good") in the evaluation cells further down.
data['class'].value_counts()
Out[3]:
In [4]:
# Preview the first rows of the raw frame.
data.head()
Out[4]:
In [5]:
# Remove the 'class' column so it cannot leak into the features, then one-hot
# encode what is left. get_dummies expands string/categorical columns into
# indicator columns and passes numeric columns through unchanged.
data_dummies = pd.get_dummies(data.drop("class", axis=1))
In [6]:
# Inspect the feature names produced by the one-hot encoding.
data_dummies.columns
Out[6]:
In [7]:
# Convert the encoded frame to a floating-point NumPy array for scikit-learn.
# The builtin `float` is used instead of `np.float`: that alias (which simply
# pointed at the builtin) was deprecated in NumPy 1.20 and removed in 1.24,
# where accessing it raises AttributeError.
X = data_dummies.values.astype(float)
In [11]:
# Check the dimensions of the feature matrix (rows, encoded feature columns).
X.shape
Out[11]:
In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.covariance import EllipticEnvelope
In [9]:
# Standardise every feature, then project the standardised data onto enough
# leading principal components to explain more than 80% of the variance
# (a fractional n_components is interpreted as a variance target).
X_scaled = StandardScaler().fit(X).transform(X)
pca = PCA(n_components=0.8)
X_preprocessed = pca.fit_transform(X_scaled)
In [10]:
# Number of components PCA actually kept to meet the n_components=.8 variance
# target requested when `pca` was constructed.
pca.n_components_
Out[10]:
In [12]:
# Fit a robust Gaussian envelope on the PCA-reduced data; contamination=.3
# makes the detector label the 30% least typical samples as outliers.
# NOTE(review): .3 presumably mirrors the share of non-"good" rows reported by
# the value_counts cell — confirm.
# random_state is pinned because the underlying robust covariance fit draws
# random subsets; without a seed the counts, confusion matrix and AUC in the
# following cells change from run to run.
ee = EllipticEnvelope(contamination=.3, random_state=0).fit(X_preprocessed)
In [15]:
# predict() returns -1 for outliers and +1 for inliers; adding 1 maps these to
# 0 and 2 so that bincount yields [n_outliers, 0, n_inliers].
np.bincount(ee.predict(X_preprocessed) + 1)
Out[15]:
In [16]:
from sklearn.metrics import confusion_matrix
# Compare "is labelled good" (rows: False, True) against "predicted inlier"
# (columns: False, True) for the elliptic envelope.
confusion_matrix(data['class'] == "good", ee.predict(X_preprocessed) == 1)
Out[16]:
In [17]:
from sklearn.metrics import roc_auc_score
# Threshold-free evaluation: AUC of ranking "good" rows above the rest by the
# envelope's decision function (larger values = more inlier-like).
roc_auc_score(data['class'] == "good", ee.decision_function(X_preprocessed))
Out[17]:
In [19]:
# Fit a PCA that keeps every component and plot each component's share of the
# total variance, in component order, on a fresh figure.
pca_full = PCA()
pca_full.fit(X_scaled)
plt.figure()
plt.plot(pca_full.explained_variance_ratio_)
Out[19]:
In [16]:
# Use the per-sample log-likelihood under the fitted PCA model (`pca`, the
# 80%-variance fit) as an inlier score and evaluate it with AUC.
roc_auc_score(data['class'] == "good", pca.score_samples(X_scaled))
Out[16]:
In [20]:
from robust_pca import RobustPCA
# Fit RobustPCA with its default constructor arguments on the standardised
# features. RobustPCA comes from the local `robust_pca` module, which is not
# shown in this notebook.
rpca = RobustPCA().fit(X_scaled)
In [21]:
# Same AUC evaluation, scored with RobustPCA.score_samples.
# NOTE(review): presumably a per-sample log-likelihood analogous to
# PCA.score_samples (higher = more inlier-like) — confirm in robust_pca.
roc_auc_score(data['class'] == "good", rpca.score_samples(X_scaled))
Out[21]:
In [22]:
from sklearn.neighbors import KernelDensity
In [23]:
# Fit a kernel density estimate on the standardised features, using the
# default kernel. The bandwidth of 5 is hard-coded; no tuning is shown here.
kde = KernelDensity(bandwidth=5).fit(X_scaled)
In [24]:
# Histogram (100 bins) of the per-sample log-density under the fitted KDE,
# drawn on a fresh figure. The trailing ';' suppresses the text repr of the
# value returned by plt.hist.
plt.figure()
plt.hist(kde.score_samples(X_scaled), bins=100);
In [25]:
# AUC of ranking "good" rows above the rest by KDE log-density
# (higher density = more typical sample).
roc_auc_score(data['class'] == "good", kde.score_samples(X_scaled))
Out[25]:
In [26]:
from sklearn.ensemble import IsolationForest
# Fit an isolation forest directly on the one-hot encoded features (neither
# standardised nor PCA-reduced); contamination=.3 sets the threshold so that
# 30% of the training samples are labelled outliers.
# random_state is pinned because the forest is built from random sub-samples
# and random splits; without a seed the confusion matrix and AUC in the
# following cells change from run to run.
iso = IsolationForest(contamination=.3, random_state=0).fit(data_dummies.values)
In [27]:
from sklearn.metrics import confusion_matrix, roc_auc_score
# Compare "is labelled good" (rows: False, True) against "predicted inlier"
# (columns: False, True) for the isolation forest; predict() returns +1 for
# inliers and -1 for outliers.
confusion_matrix(data['class'] == "good", iso.predict(data_dummies.values) == 1)
Out[27]:
In [28]:
# AUC of ranking "good" rows above the rest by the isolation forest's decision
# function (larger values = more inlier-like).
roc_auc_score(data['class'] == "good", iso.decision_function(data_dummies.values))
Out[28]: