In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib.font_manager
In [75]:
from pyod.utils.data import generate_data, get_outliers_inliers
outlier_fraction = 0.01
total_records = 200
# 1% outliers; better to set random_state, otherwise it is hard to differentiate inliers and outliers in the plot
X_train, y_train = generate_data(n_train=total_records,train_only=True, n_features=2, contamination=outlier_fraction,
random_state = 10)
x_outliers, x_inliers = get_outliers_inliers(X_train, y_train)
n_inliers = len(x_inliers)
n_outliers = len(x_outliers)
print(n_inliers, n_outliers)
In [76]:
print(X_train[:10])
print(y_train[:10])
print(np.unique(y_train, return_counts=True))
In [77]:
# split out the two features as column vectors
f1 = X_train[:,[0]]
f2 = X_train[:,[1]]
In [78]:
# The dense cluster at the top right corner is the inliers; the other 2 points are outliers
plt.scatter(f1,f2)
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()
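Because the labels are known here, a quick sanity check is to color the same scatter by y_train; a minimal sketch reusing the variables above:

plt.scatter(f1, f2, c=y_train, cmap='coolwarm')  # points colored by their true label (0 = inlier, 1 = outlier)
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()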
In [79]:
# create a meshgrid
## a 200x200 grid of points; both the x-axis and y-axis span the [-10, 10] range
xx , yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))
xx
Out[79]:
In [80]:
# check some data structures used below
print(xx.shape)
print(xx.ravel().shape)  # ravel() flattens the 200x200 matrix into a 1-D array of 40000 values
xx.ravel()
Out[80]:
In [81]:
# check some data structures used below
print(np.c_[xx.ravel(), yy.ravel()].shape)
np.c_[xx.ravel(), yy.ravel()]
Out[81]:
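If np.c_'s column-stacking is unfamiliar, a tiny standalone check makes it concrete: row i of the result pairs the i-th x-coordinate with the i-th y-coordinate, which is exactly the list of (x, y) grid points the detectors score below.

demo = np.c_[np.array([1, 2, 3]), np.array([10, 20, 30])]
print(demo)        # [[ 1 10], [ 2 20], [ 3 30]]
print(demo.shape)  # (3, 2)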
In [82]:
# import models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP
random_state = 10
classifiers = {# Linear Models
'One-class SVM (OCSVM)': OCSVM(contamination=outlier_fraction),
'Minimum Covariance Determinant (MCD)': MCD(contamination=outlier_fraction, random_state=random_state),
'Principal Component Analysis (PCA)': PCA(contamination=outlier_fraction, random_state=random_state),
# Proximity-Based Models
'Local Outlier Factor (LOF)': LOF(n_neighbors=40, contamination=outlier_fraction),
'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,
check_estimator=False, random_state=random_state),
    'Histogram-based Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),
'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction),
'Average KNN': KNN(method='mean', contamination=outlier_fraction),
# Probabilistic Models
'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outlier_fraction),
# Outlier Ensembles
    'Feature Bagging': FeatureBagging(LOF(n_neighbors=40), # more neighbors give a smoother decision boundary
contamination=outlier_fraction,
check_estimator=False,
random_state=random_state),
'Isolation Forest': IForest(contamination=outlier_fraction,
random_state=random_state)
}
In [83]:
# Fit the model
plt.figure(figsize=(20, 20))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print()
print(i + 1, 'fitting', clf_name)
# fit the data and tag outliers
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1 # predicted anomaly score
    y_pred = clf.predict(X_train)  # predict inlier (0) or outlier (1)
    # threshold: the (100 * outlier_fraction) percentile of the predicted scores
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors:', clf_name, n_errors, 'Percentage of Errors:', str(n_errors * 100.0 / total_records) + '%')
# plot the results
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1 # predicted anomaly score for each (x,y) point on meshgrid
Z = Z.reshape(xx.shape)
subplot = plt.subplot(4,4, i + 1)
    # fill a blue colormap from the minimum anomaly score up to the threshold
subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
cmap=plt.cm.Blues_r)
# draw red contour line where anomaly score is equal to threshold
a = subplot.contour(xx, yy, Z, levels=[threshold],
linewidths=2, colors='red')
    # fill the band between the threshold and the maximum anomaly score in orange
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    # inliers as white dots; outliers as black dots
b = subplot.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1], c='white', s=20, edgecolor='k')
c = subplot.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1], c='black', s=20, edgecolor='k')
subplot.axis('tight')
subplot.legend(
[a.collections[0], b, c],
['learned decision function', 'true inliers', 'true outliers'],
prop=matplotlib.font_manager.FontProperties(size=10),
loc='lower right')
subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
subplot.set_xlim((-10, 10))
subplot.set_ylim((-10, 10))
plt.show()
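Raw error counts depend on the chosen threshold. PyOD also provides an evaluate_print helper that reports ROC AUC and precision @ rank n directly from the unthresholded scores; a minimal sketch using decision_scores_, the training scores PyOD stores on each fitted detector:

from pyod.utils.data import evaluate_print

for clf_name, clf in classifiers.items():
    clf.fit(X_train)
    # prints ROC AUC and precision @ rank n for this detector's training scores
    evaluate_print(clf_name, y_train, clf.decision_scores_)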
In [84]:
from pyod.utils.data import generate_data, get_outliers_inliers
outlier_fraction = 0.05
total_records = 200
# 5% outliers; better to set random_state, otherwise it is hard to differentiate inliers and outliers in the plot
X_train, y_train = generate_data(n_train=total_records,train_only=True, n_features=3, contamination=outlier_fraction,
random_state = 10)
x_outliers, x_inliers = get_outliers_inliers(X_train, y_train)
n_inliers = len(x_inliers)
n_outliers = len(x_outliers)
print(n_inliers, n_outliers)
In [97]:
f1 = X_train[:, 0]
f2 = X_train[:, 1]
f3 = X_train[:, 2]
In [99]:
import plotly.plotly as py
import plotly
import pandas as pd
# !! Change the username and API key to your plotly account here!
plotly.tools.set_credentials_file(username='[your plotly username]', api_key='[your plotly API key]')
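Note that plotly.plotly and set_credentials_file belong to Plotly's legacy cloud API (later moved into the separate chart_studio package). In recent Plotly versions the same 3D scatter renders entirely offline with no account; a rough equivalent sketch:

import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter3d(x=f1, y=f2, z=f3, mode='markers',
                                   marker=dict(size=2, color='rgb(23, 190, 207)'))])
fig.update_layout(title='3d point clustering')
fig.show()  # renders inline in the notebook, no credentials needed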
In [101]:
scatter = dict(
mode = "markers",
name = "y",
type = "scatter3d",
x = f1, y = f2, z = f3,
marker = dict( size=2, color="rgb(23, 190, 207)" )
)
clusters = dict(
alphahull = 7,
name = "y",
opacity = 0.1,
type = "mesh3d",
x = f1, y = f2, z = f3
)
layout = dict(
title = '3d point clustering',
scene = dict(
xaxis = dict( zeroline=False ),
yaxis = dict( zeroline=False ),
zaxis = dict( zeroline=False ),
)
)
fig = dict( data=[scatter, clusters], layout=layout )
# Use py.iplot() for IPython notebook
py.iplot(fig, filename='3d point clustering')
Out[101]:
In [104]:
from IPython.display import Image
print('3D Simulated Data')
path="3d_simulated_clusters.png"
Image(path, width=600, height=600)
Out[104]:
In [107]:
# import models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP
random_state = 10
classifiers = {# Linear Models
'One-class SVM (OCSVM)': OCSVM(contamination=outlier_fraction),
'Minimum Covariance Determinant (MCD)': MCD(contamination=outlier_fraction, random_state=random_state),
'Principal Component Analysis (PCA)': PCA(contamination=outlier_fraction, random_state=random_state),
# Proximity-Based Models
'Local Outlier Factor (LOF)': LOF(n_neighbors=40, contamination=outlier_fraction),
'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,
check_estimator=False, random_state=random_state),
    'Histogram-based Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),
'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction),
'Average KNN': KNN(method='mean', contamination=outlier_fraction),
# Probabilistic Models
'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outlier_fraction),
# Outlier Ensembles
    'Feature Bagging': FeatureBagging(LOF(n_neighbors=40), # more neighbors give a smoother decision boundary
contamination=outlier_fraction,
check_estimator=False,
random_state=random_state),
'Isolation Forest': IForest(contamination=outlier_fraction,
random_state=random_state)
}
In [108]:
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print()
print(i + 1, 'fitting', clf_name)
# fit the data and tag outliers
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1 # predicted anomaly score
    y_pred = clf.predict(X_train)  # predict inlier (0) or outlier (1)
    # threshold: the (100 * outlier_fraction) percentile of the predicted scores
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors:', clf_name, n_errors, 'Percentage of Errors:', str(n_errors * 100.0 / total_records) + '%')
In [110]:
classifier = OCSVM(contamination=outlier_fraction)
classifier.fit(X_train)
scores_pred = classifier.decision_function(X_train) * -1  # predicted anomaly score
y_pred = classifier.predict(X_train)  # predict inlier (0) or outlier (1)
y_pred
Out[110]:
In [113]:
in_x, in_y, in_z, out_x, out_y, out_z = [], [], [], [], [], []
for i in range(len(y_pred)):
if y_pred[i] == 0:
in_x.append(f1[i])
in_y.append(f2[i])
in_z.append(f3[i])
else:
out_x.append(f1[i])
out_y.append(f2[i])
out_z.append(f3[i])
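The loop above works, but NumPy boolean masks express the same split more compactly; an equivalent vectorized version:

mask = (y_pred == 0)  # True for predicted inliers
in_x, in_y, in_z = f1[mask], f2[mask], f3[mask]
out_x, out_y, out_z = f1[~mask], f2[~mask], f3[~mask]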
In [121]:
inliers = dict(
mode = "markers",
name = "inlier",
type = "scatter3d",
x = in_x, y = in_y, z = in_z,
marker = dict( size=2, color="green" )
)
outliers = dict(
mode = "markers",
name = "outlier",
type = "scatter3d",
x = out_x, y = out_y, z = out_z,
marker = dict( size=2, color="red" )
)
layout = dict(
title = '3d point clustering',
scene = dict(
xaxis = dict( zeroline=False ),
yaxis = dict( zeroline=False ),
zaxis = dict( zeroline=False ),
)
)
fig = dict( data=[inliers, outliers], layout=layout )
# Use py.iplot() for IPython notebook
py.iplot(fig, filename='3d point clustering')
Out[121]:
In [122]:
from IPython.display import Image
print('3D Predicted Data')
path="3d_predicted_clusters.png"
Image(path, width=600, height=600)
Out[122]:
In [2]:
from pyod.utils.data import generate_data, get_outliers_inliers
import pandas as pd
outlier_fraction = 0.05
total_records = 200
# 5% outliers; better to set random_state, otherwise it is hard to differentiate inliers and outliers in the plot
X_train, y_train = generate_data(n_train=total_records,train_only=True, n_features=20, contamination=outlier_fraction,
random_state = 10)
x_outliers, x_inliers = get_outliers_inliers(X_train, y_train)
n_inliers = len(x_inliers)
n_outliers = len(x_outliers)
print(n_inliers, n_outliers)
In [3]:
from sklearn import manifold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
sc = StandardScaler()
scaled_X = sc.fit_transform(X_train)
scaled_X[7:10]
Out[3]:
In [4]:
# 2D visualization
n_dimensions = 2
# note: t-SNE is fit on the raw X_train here; scaled_X above is computed but not used
reduced_X = manifold.TSNE(n_components=n_dimensions, learning_rate=10, random_state=10).fit_transform(X_train)
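If the standardized features were meant to feed the embedding instead, the same call applies to scaled_X; a one-line variant (the reduced_scaled_X name is only illustrative):

reduced_scaled_X = manifold.TSNE(n_components=n_dimensions, learning_rate=10,
                                 random_state=10).fit_transform(scaled_X)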
In [5]:
reduced_X[:,1].shape
Out[5]:
In [6]:
df_tsne = pd.DataFrame(reduced_X)
df_tsne.columns = ['f1', 'f2']
df_tsne['is_outlier'] = y_train
df_tsne.head()
Out[6]:
In [7]:
# Plot the true inliers and outliers after t-SNE dimensionality reduction
print('Real inliers & outliers:')
plt.scatter(df_tsne[df_tsne['is_outlier'] == 0]['f1'], df_tsne[df_tsne['is_outlier'] == 0]['f2'], color='green', s=50, alpha=0.5)
plt.scatter(df_tsne[df_tsne['is_outlier'] == 1]['f1'], df_tsne[df_tsne['is_outlier'] == 1]['f2'], color='red', s=100, alpha=0.8)
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()
In [8]:
# import models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP
random_state = 10
classifiers = {# Linear Models
'One-class SVM (OCSVM)': OCSVM(contamination=outlier_fraction),
'Minimum Covariance Determinant (MCD)': MCD(contamination=outlier_fraction, random_state=random_state),
'Principal Component Analysis (PCA)': PCA(contamination=outlier_fraction, random_state=random_state),
# Proximity-Based Models
'Local Outlier Factor (LOF)': LOF(n_neighbors=40, contamination=outlier_fraction),
'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,
check_estimator=False, random_state=random_state),
    'Histogram-based Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),
'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction),
'Average KNN': KNN(method='mean', contamination=outlier_fraction),
# Probabilistic Models
'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outlier_fraction),
# Outlier Ensembles
    'Feature Bagging': FeatureBagging(LOF(n_neighbors=40), # more neighbors give a smoother decision boundary
contamination=outlier_fraction,
check_estimator=False,
random_state=random_state),
'Isolation Forest': IForest(contamination=outlier_fraction,
random_state=random_state)
}
In [10]:
# create a meshgrid
## a 200x200 grid of points; both the x-axis and y-axis span the [-10, 10] range
xx , yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))
xx
Out[10]:
In [23]:
# Fit the model
plt.figure(figsize=(10, 10))
j = 0
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print()
print(i + 1, 'fitting', clf_name)
# fit the data and tag outliers
clf.fit(reduced_X)
scores_pred = clf.decision_function(reduced_X) * -1 # predicted anomaly score
    y_pred = clf.predict(reduced_X)  # predict inlier (0) or outlier (1)
    # threshold: the (100 * outlier_fraction) percentile of the predicted scores
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors:', clf_name, n_errors, 'Percentage of Errors:', str(n_errors * 100.0 / total_records) + '%')
    if n_errors == 0:  # only plot the classifiers that predicted everything correctly
# plot the results
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1 # predicted anomaly score for each (x,y) point on meshgrid
Z = Z.reshape(xx.shape)
subplot = plt.subplot(2,2, j + 1)
        # fill a blue colormap from the minimum anomaly score up to the threshold
subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
cmap=plt.cm.Blues_r)
# draw red contour line where anomaly score is equal to threshold
a = subplot.contour(xx, yy, Z, levels=[threshold],
linewidths=2, colors='red')
        # fill the band between the threshold and the maximum anomaly score in orange
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # inliers as white dots; outliers as black dots
b = subplot.scatter(reduced_X[:-n_outliers, 0], reduced_X[:-n_outliers, 1], c='white', s=20, edgecolor='k')
c = subplot.scatter(reduced_X[-n_outliers:, 0], reduced_X[-n_outliers:, 1], c='black', s=20, edgecolor='k')
subplot.axis('tight')
subplot.legend(
[a.collections[0], b, c],
['learned decision function', 'true inliers', 'true outliers'],
prop=matplotlib.font_manager.FontProperties(size=10),
loc='lower right')
subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
subplot.set_xlim((-10, 10))
subplot.set_ylim((-10, 10))
j += 1
plt.show()
In [24]:
# Use the original 20-dimensional data to train
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print()
print(i + 1, 'fitting', clf_name)
# fit the data and tag outliers
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1 # predicted anomaly score
    y_pred = clf.predict(X_train)  # predict inlier (0) or outlier (1)
    # threshold: the (100 * outlier_fraction) percentile of the predicted scores
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
    n_errors = (y_pred != y_train).sum()
    print('Number of Errors:', clf_name, n_errors, 'Percentage of Errors:', str(n_errors * 100.0 / total_records) + '%')
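As in the 2D case, a threshold-free metric complements these error counts; for example, ROC AUC on the stored training scores via scikit-learn. A minimal sketch (PyOD's decision_scores_ are oriented so that higher means more anomalous, matching what roc_auc_score expects):

from sklearn.metrics import roc_auc_score

for clf_name, clf in classifiers.items():
    clf.fit(X_train)
    print(clf_name, 'ROC AUC:', round(roc_auc_score(y_train, clf.decision_scores_), 4))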
In [26]:
# check how the ABOD predictions plot
clf = ABOD(contamination=outlier_fraction)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1 # predicted anomaly score
y_pred = clf.predict(X_train)  # predict inlier (0) or outlier (1)
df_tsne['predicted_outlier'] = y_pred
df_tsne.head()
Out[26]:
In [28]:
# Plot the predicted inliers and outliers after t-SNE dimensionality reduction
print('ABOD Predicted inliers & outliers:')
plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 0]['f1'], df_tsne[df_tsne['predicted_outlier'] == 0]['f2'], color='green', s=50, alpha=0.5)
plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 1]['f1'], df_tsne[df_tsne['predicted_outlier'] == 1]['f2'], color='red', s=100, alpha=0.8)
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()
In [29]:
# check how the Feature Bagging predictions plot
clf = FeatureBagging(LOF(n_neighbors=40), # more neighbors give a smoother decision boundary
contamination=outlier_fraction,
check_estimator=False,
random_state=random_state)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train) * -1 # predicted anomaly score
y_pred = clf.predict(X_train)  # predict inlier (0) or outlier (1)
df_tsne['predicted_outlier'] = y_pred
df_tsne.head()
Out[29]:
In [30]:
# Plot the predicted inliers and outliers after t-SNE dimensionality reduction
print('Feature Bagging Predicted inliers & outliers:')
plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 0]['f1'], df_tsne[df_tsne['predicted_outlier'] == 0]['f2'], color='green', s=50, alpha=0.5)
plt.scatter(df_tsne[df_tsne['predicted_outlier'] == 1]['f1'], df_tsne[df_tsne['predicted_outlier'] == 1]['f2'], color='red', s=100, alpha=0.8)
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()
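Beyond the scatter plots, sklearn's classification_report summarizes per-class precision and recall for any of these detectors; a closing sketch applied to the last y_pred (the Feature Bagging predictions above):

from sklearn.metrics import classification_report

# per-class precision/recall for the Feature Bagging predictions against the true labels
print(classification_report(y_train, y_pred, target_names=['inlier', 'outlier']))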