In [1]:
%%time
import warnings
warnings.filterwarnings('ignore')
# ETL libs
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from inspect import signature  # sklearn.utils.fixes.signature was removed in newer scikit-learn releases
# viz libs
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# utils libs
from IPython.display import display, HTML
In [2]:
%%bash
ls -l | grep csv
In [3]:
df_info = pd.read_csv('train_labels.csv')
df_info.info()
df_info.head()
Out[3]:
In [4]:
df = pd.read_csv('train_values.csv')
df.info()
df.head()
Out[4]:
In [ ]:
In [5]:
from scipy.spatial import distance
from scipy.linalg import inv
In [6]:
iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
print(distance.mahalanobis([1, 0, 0], [0, 1, 0], iv))
print(distance.mahalanobis([0, 2, 0], [0, 1, 0], iv))
print(distance.mahalanobis([2, 0, 0], [0, 1, 0], iv))
In [7]:
x = df['oldpeak_eq_st_depression']  # serum_cholesterol_mg_per_dl
y = df['age']                       # resting_blood_pressure
X = np.vstack([x, y]).T             # one row per observation, shape (n_samples, 2)
V = np.cov(X, rowvar=False)         # 2x2 feature covariance
vi = inv(V)                         # np.linalg.inv
# Mahalanobis distance of each observation from the feature means
mah = np.array([distance.mahalanobis(row, X.mean(axis=0), vi) for row in X])
mah[:10]
In [7]:
# mahala = df[['serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'age']]
# covmx = mahala.cov()
# invcovmx = inv(covmx)
# col_means = mahala.mean()
# mahala['mahala_dist'] = mahala.apply(
#     lambda row: distance.mahalanobis(row, col_means, invcovmx),
#     axis=1
# )
# mahala = mahala[['serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'mahala_dist']]
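One way (not in the original notebook) to turn the per-observation distances from In [7] into outlier flags: for roughly Gaussian data the squared Mahalanobis distance follows a chi-square distribution with d degrees of freedom, so a quantile of that distribution gives a cutoff; the 0.975 quantile below is an arbitrary choice.
In [ ]:
from scipy.stats import chi2
# assumption: compare each distance to the square root of the chi-square(df=2)
# 97.5% quantile, since the *squared* distance is approximately chi-square distributed
cutoff = np.sqrt(chi2.ppf(0.975, df=2))
outlier_mask = mah > cutoff
print('Flagged %d of %d observations as potential outliers' % (outlier_mask.sum(), len(mah)))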
In [8]:
xlab = 'serum_cholesterol_mg_per_dl'
ylab = 'resting_blood_pressure'
x = df[xlab]
y = df[ylab]
trace = go.Scatter(
x = x,
y = y,
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)
data = [trace]
# iplot(data)
layout = go.Layout(
# showlegend = False
xaxis = dict({
'title': 'serum_cholesterol_mg_per_dl'
}),
yaxis = dict({
'title': 'resting_blood_pressure'
})
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [9]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(np.column_stack([x, y]))
X_scores = clf.negative_outlier_factor_
print(X_scores[:20])
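For reference (an added illustration), fit_predict already returns +1 for inliers and -1 for outliers, so the number of flagged points can be read off directly:
In [ ]:
# y_pred from fit_predict: +1 = inlier, -1 = outlier
print('LOF flagged %d of %d points as outliers' % ((y_pred == -1).sum(), len(y_pred)))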
In [10]:
plt.figure(figsize=(15, 15))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(x, y, color='k', s=3., label='Data points')
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(x, y, s=100000 * radius, edgecolors='r',
facecolors='none', label='Outlier scores')
plt.axis('tight')
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.show()
In [11]:
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
trace1 = go.Scatter(
x = x,
y = y,
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)
trace2 = go.Scatter(
x = x,
y = y,
name = 'outlier score',
mode = 'markers',
hoverinfo = 'text',
text = ['radius: %s' % round(rad, 2) for rad in radius],
marker=dict(
size = 200 * radius,
)
)
data = [trace1, trace2]
layout = go.Layout(
xaxis = dict({'title': xlab}),
yaxis = dict({'title': ylab})
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [12]:
df.head()
Out[12]:
In [13]:
# fit the model
clf = IsolationForest(
# behaviour='new',
max_samples=100,
random_state=22, contamination='auto')
In [14]:
xlab = 'serum_cholesterol_mg_per_dl'
ylab = 'resting_blood_pressure'
x = df[xlab].apply(float)
y = df[ylab].apply(float)
X_train = np.column_stack([x, y])  # 2-D points, one row per observation
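X_test and X_outliers are never constructed in this notebook (the cells below mirror the scikit-learn IsolationForest example); one possible construction, assuming a simple hold-out split plus synthetic points drawn over a slightly widened feature range:
In [ ]:
# assumption: last 40 rows serve as "new regular observations"; 20 synthetic
# points drawn uniformly over a widened range serve as "abnormal observations"
rng = np.random.RandomState(22)
X_test = X_train[-40:]
X_train = X_train[:-40]
X_outliers = np.column_stack([
    rng.uniform(x.min() - 50, x.max() + 50, 20),
    rng.uniform(y.min() - 20, y.max() + 20, 20),
])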
In [15]:
clf.fit(X_train)
In [16]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
In [17]:
# evaluate the decision function on a grid spanning the observed feature range
# (the sklearn example's -5..5 grid does not match the scale of these features)
xx, yy = np.meshgrid(np.linspace(x.min() - 50, x.max() + 50, 50),
                     np.linspace(y.min() - 20, y.max() + 20, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
In [18]:
plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white',
s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green',
s=20, edgecolor='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((xx.min(), xx.max()))
plt.ylim((yy.min(), yy.max()))
plt.legend([b1, b2, c],
["training observations",
"new regular observations", "new abnormal observations"],
loc="upper left")
plt.show()
In [19]:
%%bash
ls ./ | grep glass
In [20]:
df_glass = pd.read_csv(
'glass.data.txt',
header=None,
names=['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'GType']
).drop('ID', axis=1)
# http://odds.cs.stonybrook.edu/glass-data/
# > Here, class 6 is a clear minority class, as such points of class 6
# are marked as outliers, while all other points are inliers.
df_glass['GOut'] = df_glass.GType.apply(lambda x: 1 if x != 6 else 0)
gtypes = {
1: 'building_windows_float_processed',
2: 'building_windows_non_float_processed',
3: 'vehicle_windows_float_processed',
4: 'vehicle_windows_non_float_processed',
5: 'containers',
6: 'tableware',
7: 'headlamps'
}
gouts = {
0: 'outlier',
1: 'common'
}
df_glass.info()
df_glass.head()
Out[20]:
In [21]:
print(df_glass.GType.value_counts())
print(df_glass.GOut.value_counts())
In [22]:
%%time
sns.pairplot(df_glass.drop('GOut', axis=1), hue='GType')
Out[22]:
In [23]:
%%time
sns.pairplot(df_glass.drop('GType', axis=1), hue='GOut')
Out[23]:
In [29]:
name = 'RI'
y = df_glass[name]
trace1 = go.Box(
y=y,
jitter=0.3,
pointpos=-1.8,
boxpoints = 'all',
marker = dict(
color = 'rgb(8,81,156)',
),
name = name
)
trace2 = go.Box(
y=y,
jitter=0.3,
pointpos=-1.8,
boxpoints = 'suspectedoutliers',
marker = dict(
color = 'rgb(8,81,156)',
outliercolor = 'rgba(219, 64, 82, 0.6)',
line = dict(
outliercolor = 'rgba(219, 64, 82, 0.6)',
outlierwidth = 2)
),
name = name + ' - suspected outliers'
)
data = [trace1, trace2]
layout = go.Layout(
showlegend = False
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [30]:
name = 'RI'
y = df_glass[name]
q1, q3 = np.percentile(y, [25, 75])
iqr = q3 - q1
uof = q3 + 3 * iqr
uif = q3 + 1.5 * iqr
lif = q1 - 1.5 * iqr
lof = q1 - 3 * iqr
print('Upper outer fence: %s' % uof)
print('Upper inner fence: %s' % uif)
print('Lower inner fence: %s' % lif)
print('Lower outer fence: %s' % lof)
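The cells below apply only the upper fences to RI; as an added illustration, both sides can be checked at once (the variable names here are only for this sketch):
In [ ]:
# sketch: classify each RI value against Tukey's fences on both sides
ri = df_glass[name]
extreme = (ri >= uof) | (ri <= lof)
mild = ((ri >= uif) | (ri <= lif)) & ~extreme
print('Extreme outliers: %d' % extreme.sum())
print('Mild outliers: %d' % mild.sum())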
In [31]:
X_train = df_glass[['RI', 'Mg']]
X_train = X_train.values.tolist()
X_train
Out[31]:
In [32]:
y_mild_outliers = df_glass[(df_glass.RI >= uif) & (df_glass.RI < uof)][['RI', 'Mg']]
y_mild_outliers = y_mild_outliers.values.tolist()
y_mild_outliers
Out[32]:
In [33]:
# y_extr_outliers = df_glass[df_glass.RI >= uof]['RI']
# y_extr_outliers = list(y_extr_outliers)
y_extr_outliers = df_glass[df_glass.RI >= uof][['RI', 'Mg']]
y_extr_outliers = y_extr_outliers.values.tolist()
y_extr_outliers
Out[33]:
In [34]:
name1 = 'RI'
name2 = 'Mg'
traces = []
for gtyp in df_glass.GOut.unique():
    x = df_glass[df_glass.GOut == gtyp][name1]
    y = df_glass[df_glass.GOut == gtyp][name2]
    trace = go.Scatter(
        x=x,
        y=y,
        mode='markers',
        # marker = dict(
        #     color = 'rgb(8,81,156)',
        # ),
        # name = gtypes[gtyp]
        text=gouts[gtyp],
        hoverinfo='text'
    )
    traces.append(trace)
layout = go.Layout(
hovermode= 'closest',
showlegend = False,
xaxis = dict({
'title': name1
}),
yaxis = dict({
'title': name2
})
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
In [ ]:
In [35]:
plt.figure(figsize=(10,5))
y_pred = pd.Series(df_glass.RI)
# flag values in the mild-outlier band (between the upper inner and outer fences)
# as outliers (0), everything else as inliers (1)
y_pred = y_pred.apply(lambda x: 0 if (x >= uif) & (x < uof) else 1)
y_true = df_glass.GOut
print('Identified outliers: %s' % y_pred.value_counts()[0])
print('True outliers: %s' % y_true.value_counts()[0])
print('---' * 6)
print('Average precision: %s' % round(average_precision_score(y_true, y_pred), 3))
print('Recall: %s' % round(recall_score(y_true, y_pred), 3))
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=0)
print('AUC: %s' % round(auc(fpr, tpr), 3))
precision, recall, _ = precision_recall_curve(y_true, y_pred)
average_precision = average_precision_score(y_true, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.05])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.show()
In [36]:
plt.figure(figsize=(10,5))
y_pred = pd.Series(df_glass.RI)
# flag values beyond the upper outer fence as extreme outliers (0), everything else as inliers (1)
y_pred = y_pred.apply(lambda x: 0 if x >= uof else 1)
y_true = df_glass.GOut
print('Identified outliers: %s' % y_pred.value_counts()[0])
print('True outliers: %s' % y_true.value_counts()[0])
print('---' * 6)
print('Average precision: %s' % round(average_precision_score(y_true, y_pred), 3))
print('Recall: %s' % round(recall_score(y_true, y_pred), 3))
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=0)
print('AUC: %s' % round(auc(fpr, tpr), 3))
precision, recall, _ = precision_recall_curve(y_true, y_pred)
average_precision = average_precision_score(y_true, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.05])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.show()
In [37]:
x = df_glass['Mg']
y = df_glass['RI']
X = np.vstack([x, y]).T           # one row per observation
V = np.cov(X, rowvar=False)       # 2x2 feature covariance
vi = inv(V)                       # np.linalg.inv
# Mahalanobis distance of each glass sample from the feature means
mah_glass = np.array([distance.mahalanobis(row, X.mean(axis=0), vi) for row in X])
mah_glass[:10]
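An added follow-up: the per-sample distances above can be turned into 0/1 labels and compared against GOut, mirroring the evaluation cells in this notebook; the 95th-percentile cutoff is an arbitrary assumption.
In [ ]:
# assumption: flag the 5% largest Mahalanobis distances as outliers (0), the rest as inliers (1)
cutoff = np.percentile(mah_glass, 95)
y_pred_mah = pd.Series(np.where(mah_glass > cutoff, 0, 1))
print('Identified outliers: %d' % (y_pred_mah == 0).sum())
print('Precision: %s' % round(precision_score(df_glass.GOut, y_pred_mah, pos_label=0), 3))
print('Recall: %s' % round(recall_score(df_glass.GOut, y_pred_mah, pos_label=0), 3))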
In [38]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
x = df_glass[name1]
y = df_glass[name2]
y_pred = clf.fit_predict(np.column_stack([x, y]))
X_scores = clf.negative_outlier_factor_
print(X_scores[:20])
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
trace1 = go.Scatter(
x = x,
y = y,
name = 'data',
mode = 'markers',
hoverinfo = 'text',
text = ['x: %s<br>y: %s' % (x_i, y_i) for x_i, y_i in zip(x, y)]
)
trace2 = go.Scatter(
x = x,
y = y,
name = 'outlier score',
mode = 'markers',
hoverinfo = 'text',
text = ['radius: %s' % round(rad, 2) for rad in radius],
marker=dict(
size = 100 * radius,
)
)
data = [trace1, trace2]
layout = go.Layout(
xaxis = dict({'title': name1}),
yaxis = dict({'title': name2})
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [39]:
plt.figure(figsize=(10,5))
y_pred = pd.Series(X_scores)
# negative_outlier_factor_ is close to -1 for inliers, while outliers get much
# more negative scores, so flag scores below -2.5 as outliers (0)
y_pred = y_pred.apply(lambda x: 0 if x < -2.5 else 1)
y_true = df_glass.GOut
print('Identified outliers: %s' % y_pred.value_counts()[0])
print('True outliers: %s' % y_true.value_counts()[0])
print('---' * 6)
print('Average precision: %s' % round(average_precision_score(y_true, y_pred), 3))
print('Recall: %s' % round(recall_score(y_true, y_pred), 3))
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=0)
print('AUC: %s' % round(auc(fpr, tpr), 3))
precision, recall, _ = precision_recall_curve(y_true, y_pred)
average_precision = average_precision_score(y_true, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.05])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.show()
In [40]:
# ROC / AUC curve
plt.figure()
# Compute ROC curve and ROC area (single binary curve; the per-class variant
# from the sklearn example is kept below as a comment)
# fpr, tpr, roc_auc = dict(), dict(), dict()
# for i in range(2):
#     fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
#     roc_auc[i] = auc(fpr[i], tpr[i])
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
In [ ]:
TODO: compare the outliers extracted by each method against the outlier labels in the dataset (GOut), rather than against some other outlier measure.
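One way to carry out this comparison (a sketch, assuming the convention used throughout this notebook that 0 marks an outlier and that df_glass.GOut holds the ground truth); score_outliers is a hypothetical helper introduced only here:
In [ ]:
# hypothetical helper: score a 0/1 outlier prediction against the GOut labels
def score_outliers(y_pred, y_true=df_glass.GOut):
    y_pred = np.asarray(y_pred)
    print('Identified outliers: %d' % (y_pred == 0).sum())
    print('Precision: %s' % round(precision_score(y_true, y_pred, pos_label=0), 3))
    print('Recall: %s' % round(recall_score(y_true, y_pred, pos_label=0), 3))
    # treat outliers (0) as the positive class, so 1 - y_pred is the outlier score
    fpr, tpr, _ = roc_curve(y_true, 1 - y_pred, pos_label=0)
    print('AUC: %s' % round(auc(fpr, tpr), 3))

# example: re-score the LOF-based predictions from the cells above
score_outliers(y_pred)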
In [41]:
# average precision of the raw LOF scores against the dataset's outlier labels
# (higher negative_outlier_factor_ means more "normal", matching GOut = 1 for inliers)
average_precision = average_precision_score(y_true, X_scores)
average_precision
Out[41]:
In [42]:
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# one-vs-rest classifier on the glass types, adapted from the sklearn
# precision-recall example so that it runs on this dataset
random_state = 22
X_feat = df_glass.drop(['GType', 'GOut'], axis=1).values
Y = label_binarize(df_glass.GType, classes=sorted(df_glass.GType.unique()))
n_classes = Y.shape[1]
X_train, X_test, Y_train, Y_test = train_test_split(
    X_feat, Y, test_size=.5, random_state=random_state)
classifier = OneVsRestClassifier(LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
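A short follow-up (an assumption about where this cell was heading, based on the scikit-learn precision-recall example it mirrors): average precision of the decision scores for each binarized glass type.
In [ ]:
# average precision per binarized class column
for i in range(n_classes):
    print('class column %d: AP = %.3f' % (i, average_precision_score(Y_test[:, i], y_score[:, i])))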
In [43]:
# fit the model
clf = IsolationForest(
# behaviour='new',
max_samples=100,
random_state=22, contamination='auto')
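As in the heart-disease example, X_test and X_outliers are never defined; a sketch under the same assumptions, which also rebuilds X_train as a NumPy array of the RI/Mg pairs so it can be indexed with [:, 0] in the plotting cell below:
In [ ]:
# assumption: array of RI/Mg pairs, a 40-row hold-out, and 20 synthetic points
# drawn over a slightly widened range as stand-in anomalies
rng = np.random.RandomState(22)
X_all = df_glass[['RI', 'Mg']].values
X_train, X_test = X_all[:-40], X_all[-40:]
X_outliers = np.column_stack([
    rng.uniform(X_all[:, 0].min() - 0.05, X_all[:, 0].max() + 0.05, 20),
    rng.uniform(X_all[:, 1].min() - 1.0, X_all[:, 1].max() + 1.0, 20),
])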
In [44]:
clf.fit(X_train)
In [45]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
In [46]:
# evaluate the decision function on a grid spanning the observed RI/Mg range
# (rather than the sklearn example's -5..5 grid)
xx, yy = np.meshgrid(np.linspace(x.min() - 0.05, x.max() + 0.05, 50),
                     np.linspace(y.min() - 1.0, y.max() + 1.0, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
In [47]:
plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white',
s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green',
s=20, edgecolor='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((xx.min(), xx.max()))
plt.ylim((yy.min(), yy.max()))
plt.legend([b1, b2, c],
["training observations",
"new regular observations", "new abnormal observations"],
loc="upper left")
plt.show()
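To tie this back to the labels (an added illustration, following the note above about comparing each method to GOut): the fitted IsolationForest can be scored on all RI/Mg points, mapping its -1/+1 output onto the 0/1 convention used here.
In [ ]:
# IsolationForest.predict returns -1 for outliers and +1 for inliers;
# map that onto 0 = outlier / 1 = inlier to match GOut
if_pred = pd.Series(clf.predict(df_glass[['RI', 'Mg']].values)).map({-1: 0, 1: 1})
print('Identified outliers: %d' % (if_pred == 0).sum())
print('Precision: %s' % round(precision_score(df_glass.GOut, if_pred, pos_label=0), 3))
print('Recall: %s' % round(recall_score(df_glass.GOut, if_pred, pos_label=0), 3))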