In [96]:
"""
see:
http://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
http://r.789695.n4.nabble.com/Identifying-outliers-in-non-normally-distributed-data-td987921.html
There is a huge number of ways to test for outliers
A simple but good outlier test is to remove points based on their "median absolute deviation (MAD)".
"""
# !!! Attention !!!
# 1) We need the outlier classifier to work correctly regardless of sample size
# the MAD-based classifier works correctly regardless of sample-size,
# 2) Identifying an observation as an outlier depends on the underlying distribution of the data.
# here we limit the discussion to univariate data sets that are assumed to follow an approximately normal distribution.
# If the normality assumption for the data being tested is not valid, then a determination that there is an outlier may
# in fact be due to the non-normality of the data rather than the presence of an outlier.
# It is recommended that modified Z-scores with an absolute value greater than 3.5 be labeled as outliers
# imports
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [1]:
# Some outlier methods...
def mad_based_outlier(points, thresh=3.5):
    """Return the observations flagged as outliers by the MAD test.

    Uses the modified Z-score (Iglewicz & Hoaglin): 0.6745 * dist / MAD,
    where dist is each point's Euclidean distance from the component-wise
    median. Scores above ``thresh`` (3.5 is the recommended cutoff) are
    labeled outliers.

    :param points: 1-D array of observations, or 2-D array (n_obs, n_dims).
    :param thresh: modified Z-score cutoff, default 3.5.
    :return: array of the outlying observations; 1-D input yields a 1-D
        result, consistent with the other detectors in this notebook.
    """
    points = np.asarray(points)
    pts = points[:, None] if points.ndim == 1 else points
    median = np.median(pts, axis=0)
    # Euclidean distance of each observation from the median point
    diff = np.sqrt(np.sum((pts - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)
    if med_abs_deviation == 0:
        # Degenerate case (>= 50% identical points): the original divided by
        # zero here. With MAD == 0 the score is undefined; flag nothing.
        return points[np.zeros(len(pts), dtype=bool)]
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return points[modified_z_score > thresh]
def percentile_based_outlier(data, threshold=95):
    """Flag points lying outside the central ``threshold``-percent band.

    The excluded (100 - threshold) percent is split evenly between the two
    tails, e.g. threshold=95 keeps the [2.5th, 97.5th] percentile range.

    :param data: 1-D array of observations.
    :param threshold: width (in percent) of the central inlier band.
    :return: array of the observations falling in either tail.
    """
    tail = (100 - threshold) / 2.0
    lower, upper = np.percentile(data, [tail, 100 - tail])
    outside = np.logical_or(data < lower, data > upper)
    return data[outside]
def tukey_based_outlier(data, m=1.5):
    """Tukey's fences test based on the interquartile range (IQR).

    Observations below Q1 - m*IQR or above Q3 + m*IQR are flagged. Tukey's
    classic multiplier is 1.5; there is no strong statistical justification
    for that exact value, and a larger m (e.g. 3) flags only the "far out"
    extremes.

    :param data: 1-D array of observations.
    :param m: IQR multiplier controlling the fence width, default 1.5.
    :return: array of the observations outside the fences.
    """
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_fence = q1 - m * iqr
    upper_fence = q3 + m * iqr
    return data[(data < lower_fence) | (data > upper_fence)]
In [98]:
def find_outliers_and_plot(x):
    """Plot the KDE + rug of ``x`` and mark MAD-detected outliers in red."""
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    sns.distplot(x, ax=ax, rug=True, hist=False)
    # Alternative detectors, kept here for quick comparison:
    # outliers = tukey_based_outlier(x)
    # outliers = percentile_based_outlier(x)
    outliers = mad_based_outlier(x)
    # red dots on the baseline mark the flagged points
    ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
    title_pos = dict(y=0.95, x=0.05, ha='left', va='top')
    ax.set_title('MAD-based Outliers', **title_pos)
    fig.suptitle('Comparing Outlier Tests with n={}'.format(len(x)), size=14)
def normal_and_outliers(n_events, seed=None, outliers=(-3, -10, 12)):
    """Generate ``n_events`` samples: an N(0, 0.5) bulk plus injected outliers.

    :param n_events: total number of samples returned; must be at least the
        number of injected outliers (the original raised an obscure numpy
        error for n_events < 3).
    :param seed: optional RNG seed for reproducibility. None preserves the
        legacy behavior of drawing from numpy's global random state.
    :param outliers: values appended as known outliers; the default matches
        the three originally hard-coded points (-3, -10, 12).
    :return: 1-D float array of length ``n_events`` with the outliers last.
    """
    injected = np.asarray(outliers, dtype=float)
    if n_events < len(injected):
        raise ValueError(
            'n_events must be at least {}'.format(len(injected)))
    # Use the global state when no seed is given, so legacy calls behave as before
    rng = np.random if seed is None else np.random.RandomState(seed)
    bulk = rng.normal(0, 0.5, n_events - len(injected))
    return np.r_[bulk, injected]
def find_outliers():
    """Run the outlier-detection demo on samples of increasing size.

    Sizes 10/100/1000 check that the MAD-based classifier behaves
    consistently regardless of the sample size.
    """
    for size in (10, 100, 1000):
        sample = normal_and_outliers(size)
        find_outliers_and_plot(sample)
    plt.show()
find_outliers()
In [99]:
# But in many cases we do not have symmetrical and normal distributions, e.g. a landau with a big tail and outliers after that tail
# If we use the MAD version, we'll get far too many outliers in the tail - points which do not seem to be genuine outliers
# A good candidate is Tukey method because
# 1) it is simple, just leveraging the Interquartile Range
# 2) it is applicable to most ranges since it isn’t dependent on distributional assumptions
# 3) it ignores the mean and standard deviation, making it resistant to being influenced by the extreme values in the range
# !!! Attention !!!
# On this task there are written many books and many libraries (see robust lib of R)
# There is not any super robust algorithm that automatically detects outliers for anything we feed it!
# Even if we assume a normal distribution, declaring points as outliers is somewhat tricky/fraudulent
# This is because we need a good _theoretical_ reason for making the decision => such judgement is impossible to codify in an algorithm
import pandas as pd
def plot_it(x):
    """Plot the distribution of ``x`` and mark Tukey-detected outliers in red.

    Uses a wide fence multiplier (m=6) so that only points far beyond the
    heavy (Landau-like) tail are flagged, not the tail itself — the MAD test
    flags far too much of the tail on skewed data.
    """
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    sns.distplot(x, ax=ax, rug=True, hist=False)
    outliers = tukey_based_outlier(x, 6)
    # red dots on the baseline mark the flagged points
    ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
    # The original built this kwargs dict but never used it; apply it as intended.
    ax.set_title('Tukey-based Outliers (m=6)', y=0.95, x=0.05, ha='left', va='top')
# read real data distribution
# Landau-like sample with injected outliers, one unnamed column (header=None).
# NOTE(review): relative path — presumably the notebook is run from the repo
# root; confirm the CSV location before a fresh run.
df = pd.read_csv('small_data_samples/landau_approx_and_outliers.csv', header=None)
# column 0 as a plain numpy array for the detector/plotter functions
x = df[0].values
plot_it(x)
In [ ]: