In [1]:
import pandas as pd
from pandas import DataFrame as DF, Series
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
This dataset contains two metrics for a particular server. Anomalous behavior can indicate various kinds of problems. The goal is to build a system that can detect anomalies in a live feed of these metrics.
In [2]:
# read data
data_tr = pd.read_csv('multivariate_tr.csv')
data = pd.read_csv('multivariate_ts.csv')
In [3]:
data_tr.describe()
Out[3]:
In [4]:
from pandas.plotting import scatter_matrix
scatter_matrix(data_tr, hist_kwds={'bins':25}, figsize=(10,7));
In [5]:
data_tr.rolling(window=1000).mean().dropna().plot(figsize=(10,5));
In [6]:
# estimate and return multivariate gaussian distribution
def multi_gaus_dist(window):
    means = window.mean()
    cov = window.cov()
    dist = st.multivariate_normal(means, cov)
    return dist

# check if obs is an outlier, i.e. its density falls below the threshold
def isoutlier(obs, dist, thresh=0.0001):
    p = dist.pdf(obs)
    return p < thresh
In [7]:
# fit one distribution on all of the training data, then count how many
# test observations fall below the default threshold
dist = multi_gaus_dist(data_tr)
sum(isoutlier(point[1:], dist) for point in data.itertuples())  # point[1:] drops the index field
Out[7]:
In [8]:
# prepend the last 1,000 training rows so every test point has a full trailing window
combined_df = pd.concat([data_tr.iloc[-1000:], data], axis=0)
In [9]:
anomaly = []
for i in range(1000, len(combined_df)):
    # re-estimate the distribution from the trailing 1,000 observations
    window = combined_df.iloc[i-1000:i]
    dist = multi_gaus_dist(window)
    obs = combined_df.iloc[i]
    anomaly.append(isoutlier(obs, dist, thresh=0.00001))
In [10]:
mask = np.array(anomaly)
outliers = data[mask]
inliers = data[~mask]
In [11]:
outliers.shape, inliers.shape
Out[11]:
In [12]:
mu = data.mean()
cov = data.cov()
mvd = st.multivariate_normal(mu, cov)
# evaluate the density on a grid covering the data range
mesh = np.mgrid[data.X1.min()-1:data.X1.max()+1:.01,
                data.X2.min()-1:data.X2.max()+1:.01]
pos = np.empty(mesh[0].shape + (2,))
pos[:, :, 0] = mesh[0]
pos[:, :, 1] = mesh[1]
plt.figure(figsize=(14,10))
cs = plt.contour(mesh[0], mesh[1], mvd.pdf(pos), 10, cmap='bone')
plt.clabel(cs, inline=1, fontsize=12)
# plot the points on top of the contours, anomalies in red
temp = data.copy()
temp['color'] = np.where(anomaly, 'red', '#cccccc')
plt.scatter(temp['X1'], temp['X2'], c=temp['color'], alpha=0.5)
plt.xlabel('X1')
plt.ylabel('X2');
One way to optimize the threshold would be to use historical data to label observations that, in hindsight, qualify as anomalies. With those labels you can use cross-validation to tune the threshold until it maximizes some metric (e.g. the F1-score), as sketched below.
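A minimal sketch of that tuning loop, under loud assumptions: `labeled` is a DataFrame of historical observations and `y_true` a boolean array marking which of them were anomalies; neither exists in this notebook, and the candidate grid is arbitrary.

from sklearn.metrics import f1_score

def tune_threshold(labeled, y_true, dist, candidates=np.logspace(-8, -2, 25)):
    # `labeled` / `y_true` are a hypothetical labelled history, not data from above
    densities = dist.pdf(labeled)          # score each observation once
    best_t, best_f1 = None, -1.0
    for t in candidates:
        preds = densities < t              # flag observations below the cutoff
        f1 = f1_score(y_true, preds)
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t, best_f1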
This dataset contains chronological data for a computer system. At each time step it simply indicates whether an error occurred (1) or not (0). The system has a fixed expected rate of 5 errors per thousand observations. The goal is to detect whether an abnormal number of errors occurred in any given time window.
In [13]:
data = np.loadtxt('errors.csv')
data
Out[13]:
Poisson distribution: since errors occur independently at a fixed average rate, the number of errors in each window of 1,000 observations should follow a Poisson distribution with mu = 5.
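To see why a fixed cutoff on the pmf works here, it helps to look at a few values (rv5 is just for this aside, not part of the pipeline below):

rv5 = st.poisson(mu=5)
for k in [0, 1, 5, 10, 11, 12]:
    print(k, round(rv5.pmf(k), 5))

With a threshold of 0.01, blocks with zero errors or with eleven or more fall below the cutoff, while counts near the expected five are comfortably above it.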
In [14]:
# average errors per block of 1,000, computed two ways
1e3*data.mean(), data.reshape(-1, 1000).sum(axis=1).mean()
Out[14]:
In [15]:
anomalies = []
fullset = []
thresh = 0.01
mu = 1e3*data.mean()
rv = st.poisson(mu=mu)
n = 0
for i, v in enumerate(data):
    if v == 1:
        n += 1
    # close out the block after every 1,000th observation
    if (i + 1) % 1000 == 0:
        block = i // 1000
        p = rv.pmf(n)
        fullset.append((block, n))
        if p < thresh:
            anomalies.append((block, n, p))
        n = 0
print('Stream Complete')
In [16]:
anomalies = DF(anomalies, columns=['block', 'n_errors', 'p-value'])
fullset = DF(fullset, columns=['block', 'n_errors'])
In [17]:
anomalies
Out[17]:
In [18]:
fullset.head()
Out[18]:
In [19]:
outliers = fullset[fullset.block.isin(anomalies.block)].n_errors
inliers = fullset[~fullset.block.isin(anomalies.block)].n_errors
outliers.plot(figsize=(16,8), style='ro')
inliers.plot(figsize=(16,8), style='x');
This is the well-known iris dataset (slightly modified), covering three varieties of the iris flower; it works well for clustering.
In [20]:
# read data
train = pd.read_csv('iris_train.csv')
test = pd.read_csv('iris_test.csv')
# import k-means
from sklearn.cluster import KMeans as KM
In [21]:
Xtr = train.iloc[:, :-1]
ytr = train.iloc[:, -1]
Xts = test.iloc[:, :-1]
yts = test.iloc[:, -1]
In [22]:
ytr.value_counts()
Out[22]:
In [23]:
km = KM(n_clusters=3)
km.fit(Xtr)
Out[23]:
In [24]:
# get 98th percentile for distances from each center
# transform converts the observations to distance space
quants = DF(km.transform(Xtr)).quantile(0.98)
quants
Out[24]:
In [25]:
# get the distances of the test data from centers
distances_ts = DF(km.transform(Xts))
In [26]:
# outliers are those where distance from all centers is greater than respective percentiles
outlier = (distances_ts > quants).all(axis=1)
outliers = Xts[outlier]
inliers = Xts[~outlier]
In [27]:
outliers
Out[27]:
In [28]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(1, figsize=(14, 8))
# create 3d axis object to use and set rotation for good view
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-150, azim=3)
ax.scatter(outliers['1'], outliers['2'], outliers['3'], c='r')
# map label numbers to color values
colors = yts[yts.index.isin(inliers.index)].map({0:'b', 1:'g', 2:'k'})
ax.scatter(inliers['1'], inliers['2'], inliers['3'], c=colors);