Try PyOD

  • Basic experiments to observe the performance of PyOD anomaly detection algorithms

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib.font_manager

2D Simulated Data

  • Small dataset, just 200 records
  • 2 features - 2D
  • Besides calculating the anomaly score for each of the 200 records, each algorithm also has to score every point on the meshgrid (200*200 = 40,000 points) - this also gives a rough sense of whether a PyOD implementation handles a larger number of records well

In [75]:
from pyod.utils.data import generate_data, get_outliers_inliers

outlier_fraction = 0.01
total_records = 200

# 1% outliers; better to set random_state, otherwise it's difficult to differentiate inliers and outliers in the plot
X_train, y_train = generate_data(n_train=total_records,train_only=True, n_features=2, contamination=outlier_fraction,
                                 random_state = 10)
x_outliers, x_inliers = get_outliers_inliers(X_train, y_train)

n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

print(n_inliers, n_outliers)


(198, 2)

In [76]:
print(X_train[:10])
print(y_train[:10])
print(np.unique(y_train, return_counts=True))


[[ 8.12134237  8.91658164]
 [ 9.08988677  9.35806311]
 [ 8.72853316  8.73078769]
 [ 9.13545602  8.74236303]
 [ 8.83968938  9.30961464]
 [ 8.95916181  9.0528698 ]
 [ 9.09318097  9.51839971]
 [ 8.9278589   8.69203977]
 [ 9.12710325  9.42224982]
 [ 9.13189251  8.87163196]]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
(array([ 0.,  1.]), array([198,   2]))

In [77]:
# split out the 2 features
f1 = X_train[:,[0]]
f2 = X_train[:,[1]]

In [78]:
# The cluster at the top right corner is the inliers; the remaining points (2 here) are the outliers
plt.scatter(f1,f2)
plt.xlabel('Feature1')
plt.ylabel('Feature2') 
plt.show()



In [79]:
# create a meshgrid
## 200 points per axis; the plotted meshgrid covers [-10, 10] on both the x-axis and the y-axis
xx , yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))
xx


Out[79]:
array([[-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       ..., 
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ]])

In [80]:
# check some data structures used below
print(xx.shape)
print(xx.ravel().shape)  # ravel() flattens the 200*200 matrix into a 1D array of 40,000 values
xx.ravel()


(200, 200)
(40000,)
Out[80]:
array([-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
         9.89949749,  10.        ])

In [81]:
# check some data structures used below
print(np.c_[xx.ravel(), yy.ravel()].shape)
np.c_[xx.ravel(), yy.ravel()]


(40000, 2)
Out[81]:
array([[-10.        , -10.        ],
       [ -9.89949749, -10.        ],
       [ -9.79899497, -10.        ],
       ..., 
       [  9.79899497,  10.        ],
       [  9.89949749,  10.        ],
       [ 10.        ,  10.        ]])

In [82]:
# import models 
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP

random_state = 10

classifiers = {# Linear Models
               'One-class SVM (OCSVM)': OCSVM(contamination=outlier_fraction),
               'Minimum Covariance Determinant (MCD)': MCD(contamination=outlier_fraction, random_state=random_state),
               'Principal Component Analysis (PCA)': PCA(contamination=outlier_fraction, random_state=random_state),
               
               # Proximity-Based Models
               'Local Outlier Factor (LOF)': LOF(n_neighbors=40, contamination=outlier_fraction),
               'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,
                                                  check_estimator=False, random_state=random_state),
               'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),
               'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction),
               'Average KNN': KNN(method='mean', contamination=outlier_fraction),
    
               # Probabilistic Models
               'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outlier_fraction),
    
               # Outlier Ensembles
               'Feature Bagging': FeatureBagging(LOF(n_neighbors=40), # more neighbors give a more precise decision boundary
                       contamination=outlier_fraction,
                       check_estimator=False,
                       random_state=random_state),
               'Isolation Forest': IForest(contamination=outlier_fraction,
                                random_state=random_state)
              }

In [83]:
# Fit the model
plt.figure(figsize=(20, 20))

for i, (clf_name, clf) in enumerate(classifiers.items()):
    print
    print(i + 1, 'fitting', clf_name)
    # fit the data and tag outliers
    clf.fit(X_train)
    scores_pred = clf.decision_function(X_train) * -1  # predicted anomaly score
    y_pred = clf.predict(X_train)  # predict whether each point is an inlier or an outlier
    # threshold is the (100 * outlier_fraction) percentile of scores_pred
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors : ', clf_name, n_errors, 'Percentage of Errors: ', str(n_errors*100.0/total_records)+'%')

    # plot the results
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1  # predicted anomaly score for each (x,y) point on meshgrid
    Z = Z.reshape(xx.shape)
    subplot = plt.subplot(4,4, i + 1)
    
    # fill blue colormap from minimum anomaly score to threshold value
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=plt.cm.Blues_r)
    # draw red contour line where anomaly score is equal to threshold
    a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
    # fill the region from the threshold to the maximum score with orange
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    # inliers as white dots; outliers as black dots
    b = subplot.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1], c='white', s=20, edgecolor='k')
    c = subplot.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1], c='black', s=20, edgecolor='k')
    subplot.axis('tight')
    subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=10),
            loc='lower right')
    subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
    subplot.set_xlim((-10, 10))
    subplot.set_ylim((-10, 10))
plt.show()


(1, 'fitting', 'Histogram-base Outlier Detection (HBOS)')
('Number of Errors : ', 'Histogram-base Outlier Detection (HBOS)', 2, 'Percentage of Errors: ', '1.0%')

(2, 'fitting', 'Local Outlier Factor (LOF)')
('Number of Errors : ', 'Local Outlier Factor (LOF)', 0, 'Percentage of Errors: ', '0.0%')

(3, 'fitting', 'One-class SVM (OCSVM)')
('Number of Errors : ', 'One-class SVM (OCSVM)', 0, 'Percentage of Errors: ', '0.0%')

(4, 'fitting', 'Principal Component Analysis (PCA)')
('Number of Errors : ', 'Principal Component Analysis (PCA)', 0, 'Percentage of Errors: ', '0.0%')

(5, 'fitting', 'Isolation Forest')
('Number of Errors : ', 'Isolation Forest', 0, 'Percentage of Errors: ', '0.0%')

(6, 'fitting', 'Angle-based Outlier Detector (ABOD)')
('Number of Errors : ', 'Angle-based Outlier Detector (ABOD)', 1, 'Percentage of Errors: ', '0.5%')

(7, 'fitting', 'K Nearest Neighbors (KNN)')
('Number of Errors : ', 'K Nearest Neighbors (KNN)', 0, 'Percentage of Errors: ', '0.0%')

(8, 'fitting', 'Feature Bagging')
('Number of Errors : ', 'Feature Bagging', 0, 'Percentage of Errors: ', '0.0%')

(9, 'fitting', 'Minimum Covariance Determinant (MCD)')
('Number of Errors : ', 'Minimum Covariance Determinant (MCD)', 0, 'Percentage of Errors: ', '0.0%')

(10, 'fitting', 'Cluster-based Local Outlier Factor (CBLOF)')
('Number of Errors : ', 'Cluster-based Local Outlier Factor (CBLOF)', 0, 'Percentage of Errors: ', '0.0%')

(11, 'fitting', 'Average KNN')
('Number of Errors : ', 'Average KNN', 0, 'Percentage of Errors: ', '0.0%')

Observations

  • When doing this experiment, I tried an outlier fraction of 0.01 (2 outliers out of 200 records) and 0.05 (10 outliers out of 200 records), since in real-world practice it is common to have less than 1% outliers. On this small dataset, 0.01 and 0.05 gave almost the same accuracy; however, this also depends on how the outliers were generated.
  • Even with such a small dataset - 200 records plus 40,000 meshgrid points to score - a few PyOD algorithms run very slowly and even failed to produce any output within hours:
    • LSCP
    • SOS
    • LOCI
  • The other methods are all very fast; ABOD is a bit slower (a simple timing sketch is shown after this list)
  • Now let's look at the plots above:
    • The blue colormap indicates the anomaly score: the lower the score, the lighter the blue. ABOD and OCSVM tend to separate inliers and outliers more clearly, because the 2 outliers receive noticeably higher anomaly scores under these 2 algorithms compared with the others.
    • For LOF and Feature Bagging, the higher the number of neighbors, the more precise the plotted decision boundary.
    • Also pay attention to the orange area in the plots: it is the region where the predicted anomaly score is below the threshold, i.e. the predicted inlier region. When this region is much larger than the actual inlier cluster, it tends to swallow true outliers as well.
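
To back up the speed observation above, a minimal timing sketch like the following could be used (an illustration only, not part of the original run; it reuses the classifiers dict, X_train, xx and yy defined in the cells above):

import time

# score the same 40,000 meshgrid points the plotting loop uses
grid_points = np.c_[xx.ravel(), yy.ravel()]

for clf_name, clf in classifiers.items():
    start = time.time()
    clf.fit(X_train)                    # fit on the 200 training records
    clf.decision_function(grid_points)  # anomaly score for every meshgrid point
    print(clf_name, 'took', round(time.time() - start, 2), 'seconds')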

3D Simulated Data


In [84]:
from pyod.utils.data import generate_data, get_outliers_inliers

outlier_fraction = 0.05
total_records = 200

# 5% outliers; better to set random_state, otherwise it's difficult to differentiate inliers and outliers in the plot
X_train, y_train = generate_data(n_train=total_records,train_only=True, n_features=3, contamination=outlier_fraction,
                                 random_state = 10)
x_outliers, x_inliers = get_outliers_inliers(X_train, y_train)

n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

print(n_inliers, n_outliers)


(190, 10)

In [97]:
f1 = X_train[:, 0]
f2 = X_train[:, 1]
f3 = X_train[:, 2]

In [99]:
import plotly.plotly as py
import plotly
import pandas as pd

# !! Change the username and API key to your own plotly account here!
plotly.tools.set_credentials_file(username='[your plotly username]', api_key='[your plotly API key]')
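
Side note: if you would rather not send data to the plotly cloud (or don't have a plotly account), plotly's offline mode can be used instead. A minimal sketch (assuming a plotly version that still ships the plotly.offline module, as the plotly.plotly import above implies):

from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)  # embed plotly.js in the notebook; no credentials needed
# later, call iplot(fig) instead of py.iplot(fig, filename=...)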

In [101]:
scatter = dict(
    mode = "markers",
    name = "y",
    type = "scatter3d",    
    x = f1, y = f2, z = f3,
    marker = dict( size=2, color="rgb(23, 190, 207)" )
)
clusters = dict(
    alphahull = 7,
    name = "y",
    opacity = 0.1,
    type = "mesh3d",    
    x = f1, y = f2, z = f3
)
layout = dict(
    title = '3d point clustering',
    scene = dict(
        xaxis = dict( zeroline=False ),
        yaxis = dict( zeroline=False ),
        zaxis = dict( zeroline=False ),
    )
)
fig = dict( data=[scatter, clusters], layout=layout )

# Use py.iplot() for IPython notebook
py.iplot(fig, filename='3d point clustering')


High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~wuhanhan999/0 or inside your plot.ly account where it is named '3d point clustering'
Out[101]:

In [104]:
from IPython.display import Image

print('3D Simulated Data')
path="3d_simulated_clusters.png"
Image(path, width=600, height=600)


3D Simulated Data
Out[104]:

In [107]:
# import models 
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP

random_state = 10

classifiers = {# Linear Models
               'One-class SVM (OCSVM)': OCSVM(contamination=outlier_fraction),
               'Minimum Covariance Determinant (MCD)': MCD(contamination=outlier_fraction, random_state=random_state),
               'Principal Component Analysis (PCA)': PCA(contamination=outlier_fraction, random_state=random_state),
               
               # Proximity-Based Models
               'Local Outlier Factor (LOF)': LOF(n_neighbors=40, contamination=outlier_fraction),
               'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,
                                                  check_estimator=False, random_state=random_state),
               'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),
               'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction),
               'Average KNN': KNN(method='mean', contamination=outlier_fraction),
    
               # Probabilistic Models
               'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outlier_fraction),
    
               # Outlier Ensembles
               'Feature Bagging': FeatureBagging(LOF(n_neighbors=40), # more neighbors give a more precise decision boundary
                       contamination=outlier_fraction,
                       check_estimator=False,
                       random_state=random_state),
               'Isolation Forest': IForest(contamination=outlier_fraction,
                                random_state=random_state)
              }

In [108]:
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print
    print(i + 1, 'fitting', clf_name)
    # fit the data and tag outliers
    clf.fit(X_train)
    scores_pred = clf.decision_function(X_train) * -1  # predicted anomaly score
    y_pred = clf.predict(X_train)  # predict whether each point is an inlier or an outlier
    # threshold is the (100 * outlier_fraction) percentile of scores_pred
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors : ', clf_name, n_errors, 'Percentage of Errors: ', str(n_errors*100.0/total_records)+'%')


(1, 'fitting', 'Histogram-base Outlier Detection (HBOS)')
('Number of Errors : ', 'Histogram-base Outlier Detection (HBOS)', 0, 'Percentage of Errors: ', '0.0%')

(2, 'fitting', 'Local Outlier Factor (LOF)')
('Number of Errors : ', 'Local Outlier Factor (LOF)', 0, 'Percentage of Errors: ', '0.0%')

(3, 'fitting', 'One-class SVM (OCSVM)')
('Number of Errors : ', 'One-class SVM (OCSVM)', 0, 'Percentage of Errors: ', '0.0%')

(4, 'fitting', 'Principal Component Analysis (PCA)')
('Number of Errors : ', 'Principal Component Analysis (PCA)', 0, 'Percentage of Errors: ', '0.0%')

(5, 'fitting', 'Isolation Forest')
('Number of Errors : ', 'Isolation Forest', 0, 'Percentage of Errors: ', '0.0%')

(6, 'fitting', 'Angle-based Outlier Detector (ABOD)')
('Number of Errors : ', 'Angle-based Outlier Detector (ABOD)', 0, 'Percentage of Errors: ', '0.0%')

(7, 'fitting', 'K Nearest Neighbors (KNN)')
('Number of Errors : ', 'K Nearest Neighbors (KNN)', 0, 'Percentage of Errors: ', '0.0%')

(8, 'fitting', 'Feature Bagging')
('Number of Errors : ', 'Feature Bagging', 0, 'Percentage of Errors: ', '0.0%')

(9, 'fitting', 'Minimum Covariance Determinant (MCD)')
('Number of Errors : ', 'Minimum Covariance Determinant (MCD)', 0, 'Percentage of Errors: ', '0.0%')

(10, 'fitting', 'Cluster-based Local Outlier Factor (CBLOF)')
('Number of Errors : ', 'Cluster-based Local Outlier Factor (CBLOF)', 0, 'Percentage of Errors: ', '0.0%')

(11, 'fitting', 'Average KNN')
('Number of Errors : ', 'Average KNN', 0, 'Percentage of Errors: ', '0.0%')

In [110]:
classifier = OCSVM(contamination=outlier_fraction)
classifier.fit(X_train)
scores_pred = classifier.decision_function(X_train) * -1  # predicted anomaly score
y_pred = classifier.predict(X_train)  # predict whether each point is an inlier or an outlier

y_pred


Out[110]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
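
As a side note (based on my understanding of the PyOD base detector API, so treat this as an assumption): after fit(), the training-set scores and binary labels are also exposed as fitted attributes, which avoids the extra decision_function()/predict() calls on the same data:

train_scores = classifier.decision_scores_  # raw anomaly scores on the training set
train_labels = classifier.labels_           # binary labels: 0 = inlier, 1 = outlier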

In [113]:
in_x, in_y, in_z, out_x, out_y, out_z = [], [], [], [], [], []

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        in_x.append(f1[i])
        in_y.append(f2[i])
        in_z.append(f3[i])
    else:
        out_x.append(f1[i])
        out_y.append(f2[i])
        out_z.append(f3[i])
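
The loop above works; a more compact alternative is a numpy boolean mask over the same f1/f2/f3 arrays and y_pred (just a sketch, equivalent to the loop):

mask = (y_pred == 1)                                # True where a point is predicted as an outlier
in_x, in_y, in_z = f1[~mask], f2[~mask], f3[~mask]  # predicted inliers
out_x, out_y, out_z = f1[mask], f2[mask], f3[mask]  # predicted outliers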

In [121]:
inliers = dict(
    mode = "markers",
    name = "inlier",
    type = "scatter3d",    
    x = in_x, y = in_y, z = in_z,
    marker = dict( size=2, color="green" )
)
outliers = dict(
    mode = "markers",
    name = "outlier",
    type = "scatter3d",    
    x = out_x, y = out_y, z = out_z,
    marker = dict( size=2, color="red" )
)

layout = dict(
    title = '3d point clustering',
    scene = dict(
        xaxis = dict( zeroline=False ),
        yaxis = dict( zeroline=False ),
        zaxis = dict( zeroline=False ),
    )
)
fig = dict( data=[inliers, outliers], layout=layout )

# Use py.iplot() for IPython notebook
py.iplot(fig, filename='3d point clustering')


High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~wuhanhan999/0 or inside your plot.ly account where it is named '3d point clustering'
Out[121]:

In [122]:
from IPython.display import Image

print('3D Predicted Data')
path="3d_predicted_clusters.png"
Image(path, width=600, height=600)


3D Predicted Data
Out[122]:

Multi-Dimensional Simulated Data

  • 20 features, still 5% outliers among 200 records

In [2]:
from pyod.utils.data import generate_data, get_outliers_inliers
import pandas as pd

outlier_fraction = 0.05
total_records = 200

# 5% outliers; better to set random_state, otherwise it's difficult to differentiate inliers and outliers in the plot
X_train, y_train = generate_data(n_train=total_records,train_only=True, n_features=20, contamination=outlier_fraction,
                                 random_state = 10)
x_outliers, x_inliers = get_outliers_inliers(X_train, y_train)

n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

print(n_inliers, n_outliers)


(190, 10)

In [3]:
from sklearn import manifold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

sc = StandardScaler()
scaled_X = sc.fit_transform(X_train)
scaled_X[7:10]


Out[3]:
array([[ 0.09403179,  0.33481781,  0.37324944, -0.05048449,  0.13394329,
         0.3951741 ,  0.34389993,  0.15357175,  0.11891361,  0.29212325,
         0.30111893,  0.36249683,  0.18908774,  0.2972205 ,  0.08358255,
         0.47266176,  0.04844825,  0.11846864,  0.1954073 ,  0.28463239],
       [ 0.45056263,  0.07576039,  0.08726356, -0.07357153,  0.32450561,
         0.19147453,  0.13094654,  0.06072515,  0.37678493,  0.04831207,
         0.10909716,  0.03476872,  0.02092369,  0.09999799, -0.08362031,
         0.23890216,  0.08801944,  0.15276959,  0.17458333,  0.47666135],
       [ 0.32581198, -0.0776632 ,  0.3410776 ,  0.46409115,  0.0450397 ,
         0.06717681,  0.13381732,  0.1919492 ,  0.37329616, -0.18632679,
         0.30733693,  0.22900501,  0.1886759 ,  0.04959026,  0.20634519,
         0.10919938, -0.01473285,  0.11464397,  0.12030964,  0.03953956]])

In [4]:
# 2D visualization
n_dimensions = 2
reduced_X = manifold.TSNE(n_components=n_dimensions, learning_rate=10, random_state=10).fit_transform(X_train)

In [5]:
reduced_X[:,1].shape


Out[5]:
(200,)

In [6]:
df_tsne = pd.DataFrame(reduced_X)
df_tsne.columns = ['f1', 'f2']
df_tsne['is_outlier'] = y_train
df_tsne.head()


Out[6]:
f1 f2 is_outlier
0 0.120654 -0.839310 0.0
1 1.443098 -1.203190 0.0
2 0.705386 -5.369816 0.0
3 -1.955742 -5.383917 0.0
4 -3.435632 1.476585 0.0

In [7]:
# Plot the real inliers and outliers after t-SNE dimensionality reduction
print('Real inliers & outliers:')

plt.scatter(df_tsne[df_tsne['is_outlier'] == 0]['f1'], df_tsne[df_tsne['is_outlier'] == 0]['f2'], color='green', s=50, alpha=0.5)
plt.scatter(df_tsne[df_tsne['is_outlier'] == 1]['f1'], df_tsne[df_tsne['is_outlier'] == 1]['f2'], color='red', s=100, alpha=0.8)
plt.xlabel('Feature1')
plt.ylabel('Feature2') 
plt.show()


Real inliers & outliers:

In [8]:
# import models 
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP

random_state = 10

classifiers = {# Linear Models
               'One-class SVM (OCSVM)': OCSVM(contamination=outlier_fraction),
               'Minimum Covariance Determinant (MCD)': MCD(contamination=outlier_fraction, random_state=random_state),
               'Principal Component Analysis (PCA)': PCA(contamination=outlier_fraction, random_state=random_state),
               
               # Proximity-Based Models
               'Local Outlier Factor (LOF)': LOF(n_neighbors=40, contamination=outlier_fraction),
               'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,
                                                  check_estimator=False, random_state=random_state),
               'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),
               'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction),
               'Average KNN': KNN(method='mean', contamination=outlier_fraction),
    
               # Probabilistic Models
               'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outlier_fraction),
    
               # Outlier Ensembles
               'Feature Bagging': FeatureBagging(LOF(n_neighbors=40), # more neighbors give a more precise decision boundary
                       contamination=outlier_fraction,
                       check_estimator=False,
                       random_state=random_state),
               'Isolation Forest': IForest(contamination=outlier_fraction,
                                random_state=random_state)
              }

In [10]:
# create a meshgrid
## 200 points per axis; the plotted meshgrid covers [-10, 10] on both the x-axis and the y-axis
xx , yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))
xx


Out[10]:
array([[-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       ..., 
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ],
       [-10.        ,  -9.89949749,  -9.79899497, ...,   9.79899497,
          9.89949749,  10.        ]])

In [23]:
# Fit the model
plt.figure(figsize=(10, 10))
j = 0

for i, (clf_name, clf) in enumerate(classifiers.items()):
    print
    print(i + 1, 'fitting', clf_name)
    # fit the data and tag outliers
    clf.fit(reduced_X)
    scores_pred = clf.decision_function(reduced_X) * -1  # predicted anomaly score
    y_pred = clf.predict(reduced_X)  # predict whether each point is an inlier or an outlier
    # threshold is the (100 * outlier_fraction) percentile of scores_pred
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors : ', clf_name, n_errors, 'Percentage of Errors: ', str(n_errors*100.0/total_records)+'%')

    if n_errors == 0:  # only plot the classifiers that predicted everything correctly
        # plot the results
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1  # predicted anomaly score for each (x,y) point on meshgrid
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(2,2, j + 1)

        # fill blue colormap from minimum anomaly score to threshold value
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                             cmap=plt.cm.Blues_r)
        # draw red contour line where anomaly score is equal to threshold
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                                linewidths=2, colors='red')
        # fill the region from the threshold to the maximum score with orange
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # inliers as white dots; outliers as black dots
        b = subplot.scatter(reduced_X[:-n_outliers, 0], reduced_X[:-n_outliers, 1], c='white', s=20, edgecolor='k')
        c = subplot.scatter(reduced_X[-n_outliers:, 0], reduced_X[-n_outliers:, 1], c='black', s=20, edgecolor='k')
        subplot.axis('tight')
        subplot.legend(
                [a.collections[0], b, c],
                ['learned decision function', 'true inliers', 'true outliers'],
                prop=matplotlib.font_manager.FontProperties(size=10),
                loc='lower right')
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-10, 10))
        subplot.set_ylim((-10, 10))
        j += 1
plt.show()


(1, 'fitting', 'Histogram-base Outlier Detection (HBOS)')
('Number of Errors : ', 'Histogram-base Outlier Detection (HBOS)', 19, 'Percentage of Errors: ', '9.5%')

(2, 'fitting', 'Local Outlier Factor (LOF)')
('Number of Errors : ', 'Local Outlier Factor (LOF)', 0, 'Percentage of Errors: ', '0.0%')

(3, 'fitting', 'One-class SVM (OCSVM)')
('Number of Errors : ', 'One-class SVM (OCSVM)', 20, 'Percentage of Errors: ', '10.0%')

(4, 'fitting', 'Principal Component Analysis (PCA)')
('Number of Errors : ', 'Principal Component Analysis (PCA)', 16, 'Percentage of Errors: ', '8.0%')

(5, 'fitting', 'Isolation Forest')
('Number of Errors : ', 'Isolation Forest', 14, 'Percentage of Errors: ', '7.0%')

(6, 'fitting', 'Angle-based Outlier Detector (ABOD)')
('Number of Errors : ', 'Angle-based Outlier Detector (ABOD)', 21, 'Percentage of Errors: ', '10.5%')

(7, 'fitting', 'K Nearest Neighbors (KNN)')
('Number of Errors : ', 'K Nearest Neighbors (KNN)', 16, 'Percentage of Errors: ', '8.0%')

(8, 'fitting', 'Feature Bagging')
('Number of Errors : ', 'Feature Bagging', 0, 'Percentage of Errors: ', '0.0%')

(9, 'fitting', 'Minimum Covariance Determinant (MCD)')
('Number of Errors : ', 'Minimum Covariance Determinant (MCD)', 0, 'Percentage of Errors: ', '0.0%')

(10, 'fitting', 'Cluster-based Local Outlier Factor (CBLOF)')
('Number of Errors : ', 'Cluster-based Local Outlier Factor (CBLOF)', 0, 'Percentage of Errors: ', '0.0%')

(11, 'fitting', 'Average KNN')
('Number of Errors : ', 'Average KNN', 12, 'Percentage of Errors: ', '6.0%')

In [24]:
# Use the original 20-dimensional data to train
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print
    print(i + 1, 'fitting', clf_name)
    # fit the data and tag outliers
    clf.fit(X_train)
    scores_pred = clf.decision_function(X_train) * -1  # predicted anomaly score
    y_pred = clf.predict(X_train)  # predict whether each point is an inlier or an outlier
    # threshold is the (100 * outlier_fraction) percentile of scores_pred
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors : ', clf_name, n_errors, 'Percentage of Errors: ', str(n_errors*100.0/total_records)+'%')


(1, 'fitting', 'Histogram-base Outlier Detection (HBOS)')
('Number of Errors : ', 'Histogram-base Outlier Detection (HBOS)', 0, 'Percentage of Errors: ', '0.0%')

(2, 'fitting', 'Local Outlier Factor (LOF)')
('Number of Errors : ', 'Local Outlier Factor (LOF)', 0, 'Percentage of Errors: ', '0.0%')

(3, 'fitting', 'One-class SVM (OCSVM)')
('Number of Errors : ', 'One-class SVM (OCSVM)', 0, 'Percentage of Errors: ', '0.0%')

(4, 'fitting', 'Principal Component Analysis (PCA)')
('Number of Errors : ', 'Principal Component Analysis (PCA)', 0, 'Percentage of Errors: ', '0.0%')

(5, 'fitting', 'Isolation Forest')
('Number of Errors : ', 'Isolation Forest', 0, 'Percentage of Errors: ', '0.0%')

(6, 'fitting', 'Angle-based Outlier Detector (ABOD)')
('Number of Errors : ', 'Angle-based Outlier Detector (ABOD)', 5, 'Percentage of Errors: ', '2.5%')

(7, 'fitting', 'K Nearest Neighbors (KNN)')
('Number of Errors : ', 'K Nearest Neighbors (KNN)', 0, 'Percentage of Errors: ', '0.0%')

(8, 'fitting', 'Feature Bagging')
('Number of Errors : ', 'Feature Bagging', 0, 'Percentage of Errors: ', '0.0%')

(9, 'fitting', 'Minimum Covariance Determinant (MCD)')
('Number of Errors : ', 'Minimum Covariance Determinant (MCD)', 0, 'Percentage of Errors: ', '0.0%')

(10, 'fitting', 'Cluster-based Local Outlier Factor (CBLOF)')
('Number of Errors : ', 'Cluster-based Local Outlier Factor (CBLOF)', 0, 'Percentage of Errors: ', '0.0%')

(11, 'fitting', 'Average KNN')
('Number of Errors : ', 'Average KNN', 0, 'Percentage of Errors: ', '0.0%')

In [26]:
# check how the ABOD predictions look when plotted
clf = ABOD(contamination=outlier_fraction)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1  # predicted anomaly score
y_pred = clf.predict(X_train)  # predict whether each point is an inlier or an outlier

df_tsne['predicted_outlier'] = y_pred
df_tsne.head()


Out[26]:
f1 f2 is_outlier predicted_outlier
0 0.120654 -0.839310 0.0 0
1 1.443098 -1.203190 0.0 0
2 0.705386 -5.369816 0.0 0
3 -1.955742 -5.383917 0.0 0
4 -3.435632 1.476585 0.0 0

In [28]:
# Plot the predicted inliers and outliers after t-SNE dimensionality reduction
print('ABOD predicted inliers & outliers:')

plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 0]['f1'], df_tsne[df_tsne['predicted_outlier'] == 0]['f2'], color='green', s=50, alpha=0.5)
plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 1]['f1'], df_tsne[df_tsne['predicted_outlier'] == 1]['f2'], color='red', s=100, alpha=0.8)
plt.xlabel('Feature1')
plt.ylabel('Feature2') 
plt.show()


ABOD predicted inliers & outliers:

In [29]:
# check how the Feature Bagging predictions look when plotted
clf = FeatureBagging(LOF(n_neighbors=40), # more neighbors give a more precise decision boundary
                       contamination=outlier_fraction,
                       check_estimator=False,
                       random_state=random_state)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1  # predicted anomaly score
y_pred = clf.predict(X_train)  # predict whether each point is an inlier or an outlier

df_tsne['predicted_outlier'] = y_pred
df_tsne.head()


Out[29]:
f1 f2 is_outlier predicted_outlier
0 0.120654 -0.839310 0.0 0
1 1.443098 -1.203190 0.0 0
2 0.705386 -5.369816 0.0 0
3 -1.955742 -5.383917 0.0 0
4 -3.435632 1.476585 0.0 0

In [30]:
# Plot the predicted inliers and outliers after t-SNE dimensionality reduction
print('Feature Bagging predicted inliers & outliers:')

plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 0]['f1'], df_tsne[df_tsne['predicted_outlier'] == 0]['f2'], color='green', s=50, alpha=0.5)
plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 1]['f1'], df_tsne[df_tsne['predicted_outlier'] == 1]['f2'], color='red', s=100, alpha=0.8)
plt.xlabel('Feature1')
plt.ylabel('Feature2') 
plt.show()


Feature Bagging predicted inliers & outliers:

Observations

  • When we have 3+ dimensions, a reasonable number of extra dimensions can actually serve prediction better; that is why, when we fit all the classifiers on the original X_train here, most of them still perform well.
  • t-SNE is good for 2D and 3D plots, but converting high-dimensional data down to 2D or 3D before doing the prediction tends to make more classifiers predict wrongly, mainly because information is lost and the structure of the data has changed. (A quick ROC AUC comparison sketch follows below.)
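
Since raw error counts at a fixed contamination threshold can be coarse, a ranking-based metric such as ROC AUC may give a fairer comparison between the original 20-dimensional data and the t-SNE-reduced data. A minimal sketch (an illustration only, reusing the classifiers dict, X_train, reduced_X and y_train from the cells above; decision_scores_ is the fitted PyOD attribute holding the training-set anomaly scores):

from sklearn.metrics import roc_auc_score

for clf_name, clf in classifiers.items():
    clf.fit(X_train)
    auc_original = roc_auc_score(y_train, clf.decision_scores_)  # scores on the original 20-D data
    clf.fit(reduced_X)
    auc_reduced = roc_auc_score(y_train, clf.decision_scores_)   # scores on the t-SNE 2-D data
    print(clf_name, 'AUC (20-D):', round(auc_original, 3), 'AUC (t-SNE 2-D):', round(auc_reduced, 3))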