In [1]:
%%time
import warnings
warnings.filterwarnings('ignore')
# ETL libs
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from inspect import signature  # sklearn.utils.fixes.signature was removed in newer scikit-learn releases
# viz libs
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# utils libs
from IPython.display import display, HTML
In [2]:
%%bash
ls -l | grep csv
In [3]:
df_info = pd.read_csv('train_labels.csv')
df_info.info()
df_info.head()
Out[3]:
In [4]:
df = pd.read_csv('train_values.csv')
df.info()
df.head()
Out[4]:
In [ ]:
In [5]:
from scipy.spatial import distance
from scipy.linalg import inv
In [6]:
iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
print(distance.mahalanobis([1, 0, 0], [0, 1, 0], iv))
print(distance.mahalanobis([0, 2, 0], [0, 1, 0], iv))
print(distance.mahalanobis([2, 0, 0], [0, 1, 0], iv))
In [7]:
x = df['oldpeak_eq_st_depression']  # serum_cholesterol_mg_per_dl
y = df['age']                       # resting_blood_pressure
X = np.vstack([x, y]).T             # one row per observation, shape (n_samples, 2)
V = np.cov(X, rowvar=False)         # 2x2 feature covariance
vi = inv(V)                         # np.linalg.inv
# Mahalanobis distance of each observation from the feature means
mah = np.array([distance.mahalanobis(row, X.mean(axis=0), vi) for row in X])
mah[:10]
In [7]:
# mahala = df[['serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'age']]
# covmx = mahala.cov()
# invcovmx = inv(covmx)
# col_means = mahala.mean()
# mahala['mahala_dist'] = mahala.apply(
#     lambda row: distance.mahalanobis(row, col_means, invcovmx),
#     axis=1
# )
# mahala = mahala[['serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'mahala_dist']]
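One way (not in the original notebook) to turn the per-observation distances from In [7] into outlier flags: for roughly Gaussian data the squared Mahalanobis distance follows a chi-square distribution with d degrees of freedom, so a quantile of that distribution gives a cutoff; the 0.975 quantile below is an arbitrary choice.
In [ ]:
from scipy.stats import chi2
# assumption: compare each distance to the square root of the chi-square(df=2)
# 97.5% quantile, since the *squared* distance is approximately chi-square distributed
cutoff = np.sqrt(chi2.ppf(0.975, df=2))
outlier_mask = mah > cutoff
print('Flagged %d of %d observations as potential outliers' % (outlier_mask.sum(), len(mah)))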
In [8]:
xlab = 'serum_cholesterol_mg_per_dl'
ylab = 'resting_blood_pressure'
x = df[xlab]
y = df[ylab]
trace = go.Scatter(
x = x,
y = y,
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)
data = [trace]
# iplot(data)
layout = go.Layout(
# showlegend = False
xaxis = dict({
'title': 'serum_cholesterol_mg_per_dl'
}),
yaxis = dict({
'title': 'resting_blood_pressure'
})
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [9]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(np.column_stack([x, y]))
X_scores = clf.negative_outlier_factor_
print(X_scores[:20])
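For reference (an added illustration), fit_predict already returns +1 for inliers and -1 for outliers, so the number of flagged points can be read off directly:
In [ ]:
# y_pred from fit_predict: +1 = inlier, -1 = outlier
print('LOF flagged %d of %d points as outliers' % ((y_pred == -1).sum(), len(y_pred)))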
In [10]:
plt.figure(figsize=(15, 15))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(x, y, color='k', s=3., label='Data points')
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(x, y, s=100000 * radius, edgecolors='r',
facecolors='none', label='Outlier scores')
plt.axis('tight')
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.show()
In [11]:
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
trace1 = go.Scatter(
x = x,
y = y,
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)
trace2 = go.Scatter(
x = x,
y = y,
name = 'outlier score',
mode = 'markers',
hoverinfo = 'text',
text = ['radius: %s' % round(rad, 2) for rad in radius],
marker=dict(
size = 200 * radius,
)
)
data = [trace1, trace2]
layout = go.Layout(
xaxis = dict({'title': xlab}),
yaxis = dict({'title': ylab})
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [12]:
df.head()
Out[12]:
In [13]:
# fit the model
clf = IsolationForest(
# behaviour='new',
max_samples=100,
random_state=22, contamination='auto')
In [14]:
xlab = 'serum_cholesterol_mg_per_dl'
ylab = 'resting_blood_pressure'
x = df[xlab].apply(float)
y = df[ylab].apply(float)
X_train = np.column_stack([x, y])  # 2-D points, one row per observation
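X_test and X_outliers are never constructed in this notebook (the cells below mirror the scikit-learn IsolationForest example); one possible construction, assuming a simple hold-out split plus synthetic points drawn over a slightly widened feature range:
In [ ]:
# assumption: last 40 rows serve as "new regular observations"; 20 synthetic
# points drawn uniformly over a widened range serve as "abnormal observations"
rng = np.random.RandomState(22)
X_test = X_train[-40:]
X_train = X_train[:-40]
X_outliers = np.column_stack([
    rng.uniform(x.min() - 50, x.max() + 50, 20),
    rng.uniform(y.min() - 20, y.max() + 20, 20),
])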
In [15]:
clf.fit(X_train)
In [16]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
In [17]:
# evaluate the decision function on a grid spanning the observed feature range
# (the sklearn example's -5..5 grid does not match the scale of these features)
xx, yy = np.meshgrid(np.linspace(x.min() - 50, x.max() + 50, 50),
                     np.linspace(y.min() - 20, y.max() + 20, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
In [18]:
plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white',
s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green',
s=20, edgecolor='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((xx.min(), xx.max()))
plt.ylim((yy.min(), yy.max()))
plt.legend([b1, b2, c],
["training observations",
"new regular observations", "new abnormal observations"],
loc="upper left")
plt.show()
In [19]:
%%bash
ls ./ | grep glass
In [20]:
df_glass = pd.read_csv(
'glass.data.txt',
header=None,
names=['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'GType']
).drop('ID', axis=1)
# http://odds.cs.stonybrook.edu/glass-data/
# > Here, class 6 is a clear minority class, as such points of class 6
# are marked as outliers, while all other points are inliers.
df_glass['GOut'] = df_glass.GType.apply(lambda x: 1 if x != 6 else 0)
gtypes = {
1: 'building_windows_float_processed',
2: 'building_windows_non_float_processed',
3: 'vehicle_windows_float_processed',
4: 'vehicle_windows_non_float_processed',
5: 'containers',
6: 'tableware',
7: 'headlamps'
}
gouts = {
0: 'outlier',
1: 'common'
}
df_glass.info()
df_glass.head()
Out[20]:
In [21]:
print(df_glass.GType.value_counts())
print(df_glass.GOut.value_counts())
In [22]:
%%time
sns.pairplot(df_glass.drop('GOut', axis=1), hue='GType')
Out[22]:
In [23]:
%%time
sns.pairplot(df_glass.drop('GType', axis=1), hue='GOut')
Out[23]:
In [29]:
name = 'RI'
y = df_glass[name]
trace1 = go.Box(
y=y,
jitter=0.3,
pointpos=-1.8,
boxpoints = 'all',
marker = dict(
color = 'rgb(8,81,156)',
),
name = name
)
trace2 = go.Box(
y=y,
jitter=0.3,
pointpos=-1.8,
boxpoints = 'suspectedoutliers',
marker = dict(
color = 'rgb(8,81,156)',
outliercolor = 'rgba(219, 64, 82, 0.6)',
line = dict(
outliercolor = 'rgba(219, 64, 82, 0.6)',
outlierwidth = 2)
),
name = name + ' - suspected outliers'
)
data = [trace1, trace2]
layout = go.Layout(
showlegend = False
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [30]:
name = 'RI'
y = df_glass[name]
q1, q3 = np.percentile(y, [25, 75])
iqr = q3 - q1
uof = q3 + 3 * iqr
uif = q3 + 1.5 * iqr
lif = q1 - 1.5 * iqr
lof = q1 - 3 * iqr
print('Upper outer fence: %s' % uof)
print('Upper inner fence: %s' % uif)
print('Lower inner fence: %s' % lif)
print('Lower outer fence: %s' % lof)
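The cells below apply only the upper fences to RI; as an added illustration, both sides can be checked at once (the variable names here are only for this sketch):
In [ ]:
# sketch: classify each RI value against Tukey's fences on both sides
ri = df_glass[name]
extreme = (ri >= uof) | (ri <= lof)
mild = ((ri >= uif) | (ri <= lif)) & ~extreme
print('Extreme outliers: %d' % extreme.sum())
print('Mild outliers: %d' % mild.sum())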
In [31]:
X_train = df_glass[['RI', 'Mg']]
X_train = X_train.values.tolist()
X_train
Out[31]:
In [32]:
y_mild_outliers = df_glass[(df_glass.RI >= uif) & (df_glass.RI < uof)][['RI', 'Mg']]
y_mild_outliers = y_mild_outliers.values.tolist()
y_mild_outliers
Out[32]:
In [33]:
# y_extr_outliers = df_glass[df_glass.RI >= uof]['RI']
# y_extr_outliers = list(y_extr_outliers)
y_extr_outliers = df_glass[df_glass.RI >= uof][['RI', 'Mg']]
y_extr_outliers = y_extr_outliers.values.tolist()
y_extr_outliers
Out[33]:
In [34]:
name1 = 'RI'
name2 = 'Mg'
traces = []
for gtyp in df_glass.GOut.unique():
    x = df_glass[df_glass.GOut == gtyp][name1]
    y = df_glass[df_glass.GOut == gtyp][name2]
    trace = go.Scatter(
        x=x,
        y=y,
        mode='markers',
        # marker = dict(
        #     color = 'rgb(8,81,156)',
        # ),
        # name = gtypes[gtyp]
        text=gouts[gtyp],
        hoverinfo='text'
    )
    traces.append(trace)
layout = go.Layout(
hovermode= 'closest',
showlegend = False,
xaxis = dict({
'title': name1
}),
yaxis = dict({
'title': name2
})
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
In [ ]:
In [35]:
plt.figure(figsize=(10,5))
y_pred = pd.Series(df_glass.RI)
# flag values in the mild-outlier band (between the upper inner and outer fences)
# as outliers (0), everything else as inliers (1)
y_pred = y_pred.apply(lambda x: 0 if (x >= uif) & (x < uof) else 1)
y_true = df_glass.GOut
print('Identified outliers: %s' % y_pred.value_counts()[0])
print('True outliers: %s' % y_true.value_counts()[0])
print('---' * 6)
print('Average precision: %s' % round(average_precision_score(y_true, y_pred), 3))
print('Recall: %s' % round(recall_score(y_true, y_pred), 3))
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=0)
print('AUC: %s' % round(auc(fpr, tpr), 3))
precision, recall, _ = precision_recall_curve(y_true, y_pred)
average_precision = average_precision_score(y_true, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.05])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.show()
In [36]:
plt.figure(figsize=(10,5))
y_pred = pd.Series(df_glass.RI)
# flag values beyond the upper outer fence as extreme outliers (0), everything else as inliers (1)
y_pred = y_pred.apply(lambda x: 0 if x >= uof else 1)
y_true = df_glass.GOut
print('Identified outliers: %s' % y_pred.value_counts()[0])
print('True outliers: %s' % y_true.value_counts()[0])
print('---' * 6)
print('Average precision: %s' % round(average_precision_score(y_true, y_pred), 3))
print('Recall: %s' % round(recall_score(y_true, y_pred), 3))
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=0)
print('AUC: %s' % round(auc(fpr, tpr), 3))
precision, recall, _ = precision_recall_curve(y_true, y_pred)
average_precision = average_precision_score(y_true, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.05])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.show()
In [37]:
x = df_glass['Mg']
y = df_glass['RI']
X = np.vstack([x, y]).T           # one row per observation
V = np.cov(X, rowvar=False)       # 2x2 feature covariance
vi = inv(V)                       # np.linalg.inv
# Mahalanobis distance of each glass sample from the feature means
mah_glass = np.array([distance.mahalanobis(row, X.mean(axis=0), vi) for row in X])
mah_glass[:10]
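An added follow-up: the per-sample distances above can be turned into 0/1 labels and compared against GOut, mirroring the evaluation cells in this notebook; the 95th-percentile cutoff is an arbitrary assumption.
In [ ]:
# assumption: flag the 5% largest Mahalanobis distances as outliers (0), the rest as inliers (1)
cutoff = np.percentile(mah_glass, 95)
y_pred_mah = pd.Series(np.where(mah_glass > cutoff, 0, 1))
print('Identified outliers: %d' % (y_pred_mah == 0).sum())
print('Precision: %s' % round(precision_score(df_glass.GOut, y_pred_mah, pos_label=0), 3))
print('Recall: %s' % round(recall_score(df_glass.GOut, y_pred_mah, pos_label=0), 3))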
In [38]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
x = df_glass[name1]
y = df_glass[name2]
y_pred = clf.fit_predict(np.column_stack([x, y]))
X_scores = clf.negative_outlier_factor_
print(X_scores[:20])
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
trace1 = go.Scatter(
x = x,
y = y,
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)
trace2 = go.Scatter(
x = x,
y = y,
name = 'outlier score',
mode = 'markers',
hoverinfo = 'text',
text = ['radius: %s' % round(rad, 2) for rad in radius],
marker=dict(
size = 100 * radius,
)
)
data = [trace1, trace2]
layout = go.Layout(
xaxis = dict({'title': name1}),
yaxis = dict({'title': name2})
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [39]:
plt.figure(figsize=(10,5))
y_pred = pd.Series(X_scores)
# negative_outlier_factor_ is close to -1 for inliers, while outliers get much
# more negative scores, so flag scores below -2.5 as outliers (0)
y_pred = y_pred.apply(lambda x: 0 if x < -2.5 else 1)
y_true = df_glass.GOut
print('Identified outliers: %s' % y_pred.value_counts()[0])
print('True outliers: %s' % y_true.value_counts()[0])
print('---' * 6)
print('Average precision: %s' % round(average_precision_score(y_true, y_pred), 3))
print('Recall: %s' % round(recall_score(y_true, y_pred), 3))
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=0)
print('AUC: %s' % round(auc(fpr, tpr), 3))
precision, recall, _ = precision_recall_curve(y_true, y_pred)
average_precision = average_precision_score(y_true, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.05])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.show()
In [40]:
# ROC / AUC curve
plt.figure()
# Compute ROC curve and ROC area (single binary curve; the per-class variant
# from the sklearn example is kept below as a comment)
# fpr, tpr, roc_auc = dict(), dict(), dict()
# for i in range(2):
#     fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
#     roc_auc[i] = auc(fpr[i], tpr[i])
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
In [ ]:
TODO: compare the outliers extracted by each method against the outlier labels in the dataset (GOut), rather than against some other outlier measure.
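One way to carry out this comparison (a sketch, assuming the convention used throughout this notebook that 0 marks an outlier and that df_glass.GOut holds the ground truth); score_outliers is a hypothetical helper introduced only here:
In [ ]:
# hypothetical helper: score a 0/1 outlier prediction against the GOut labels
def score_outliers(y_pred, y_true=df_glass.GOut):
    y_pred = np.asarray(y_pred)
    print('Identified outliers: %d' % (y_pred == 0).sum())
    print('Precision: %s' % round(precision_score(y_true, y_pred, pos_label=0), 3))
    print('Recall: %s' % round(recall_score(y_true, y_pred, pos_label=0), 3))
    # treat outliers (0) as the positive class, so 1 - y_pred is the outlier score
    fpr, tpr, _ = roc_curve(y_true, 1 - y_pred, pos_label=0)
    print('AUC: %s' % round(auc(fpr, tpr), 3))

# example: re-score the LOF-based predictions from the cells above
score_outliers(y_pred)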
In [41]:
# average precision of the raw LOF scores against the dataset's outlier labels
# (higher negative_outlier_factor_ means more "normal", matching GOut = 1 for inliers)
average_precision = average_precision_score(y_true, X_scores)
average_precision
Out[41]:
In [42]:
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# one-vs-rest classifier on the glass types, adapted from the sklearn
# precision-recall example so that it runs on this dataset
random_state = 22
X_feat = df_glass.drop(['GType', 'GOut'], axis=1).values
Y = label_binarize(df_glass.GType, classes=sorted(df_glass.GType.unique()))
n_classes = Y.shape[1]
X_train, X_test, Y_train, Y_test = train_test_split(
    X_feat, Y, test_size=.5, random_state=random_state)
classifier = OneVsRestClassifier(LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
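A short follow-up (an assumption about where this cell was heading, based on the scikit-learn precision-recall example it mirrors): average precision of the decision scores for each binarized glass type.
In [ ]:
# average precision per binarized class column
for i in range(n_classes):
    print('class column %d: AP = %.3f' % (i, average_precision_score(Y_test[:, i], y_score[:, i])))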
In [43]:
# fit the model
clf = IsolationForest(
# behaviour='new',
max_samples=100,
random_state=22, contamination='auto')
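As in the heart-disease example, X_test and X_outliers are never defined; a sketch under the same assumptions, which also rebuilds X_train as a NumPy array of the RI/Mg pairs so it can be indexed with [:, 0] in the plotting cell below:
In [ ]:
# assumption: array of RI/Mg pairs, a 40-row hold-out, and 20 synthetic points
# drawn over a slightly widened range as stand-in anomalies
rng = np.random.RandomState(22)
X_all = df_glass[['RI', 'Mg']].values
X_train, X_test = X_all[:-40], X_all[-40:]
X_outliers = np.column_stack([
    rng.uniform(X_all[:, 0].min() - 0.05, X_all[:, 0].max() + 0.05, 20),
    rng.uniform(X_all[:, 1].min() - 1.0, X_all[:, 1].max() + 1.0, 20),
])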
In [44]:
clf.fit(X_train)
In [45]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
In [46]:
# evaluate the decision function on a grid spanning the observed RI/Mg range
# (rather than the sklearn example's -5..5 grid)
xx, yy = np.meshgrid(np.linspace(x.min() - 0.05, x.max() + 0.05, 50),
                     np.linspace(y.min() - 1.0, y.max() + 1.0, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
In [47]:
plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white',
s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green',
s=20, edgecolor='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((xx.min(), xx.max()))
plt.ylim((yy.min(), yy.max()))
plt.legend([b1, b2, c],
["training observations",
"new regular observations", "new abnormal observations"],
loc="upper left")
plt.show()
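To tie this back to the labels (an added illustration, following the note above about comparing each method to GOut): the fitted IsolationForest can be scored on all RI/Mg points, mapping its -1/+1 output onto the 0/1 convention used here.
In [ ]:
# IsolationForest.predict returns -1 for outliers and +1 for inliers;
# map that onto 0 = outlier / 1 = inlier to match GOut
if_pred = pd.Series(clf.predict(df_glass[['RI', 'Mg']].values)).map({-1: 0, 1: 1})
print('Identified outliers: %d' % (if_pred == 0).sum())
print('Precision: %s' % round(precision_score(df_glass.GOut, if_pred, pos_label=0), 3))
print('Recall: %s' % round(recall_score(df_glass.GOut, if_pred, pos_label=0), 3))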