Novelty detection using one-class SVM

One-class SVM classifies new data as similar to or different from the training set. It is an unsupervised method that builds a decision boundary between the training data and the origin in kernel space, so it can be used as a novelty detector.

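For background, the classical one-class SVM (Schölkopf et al.) separates the training points from the origin in feature space with maximum margin. The ν-parameterized primal below is the standard textbook statement; MADlib's svm_one_class optimizes a lambda-regularized variant by gradient descent, so read this as a sketch of the idea rather than the exact objective solved here:

$$\min_{w,\,\xi,\,\rho}\ \frac{1}{2}\lVert w\rVert^{2} + \frac{1}{\nu n}\sum_{i=1}^{n}\xi_{i} - \rho \quad \text{s.t. } \langle w,\phi(x_{i})\rangle \ge \rho - \xi_{i},\ \ \xi_{i} \ge 0$$

The fitted decision function $f(x) = \operatorname{sgn}(\langle w,\phi(x)\rangle - \rho)$ returns $+1$ inside the learned region (not novel) and $-1$ outside (novel), which matches the prediction and decision_function columns produced below.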

In [38]:
# Setup
%load_ext sql
# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib
%sql postgresql://fmcquillan@localhost:5432/madlib
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager


The sql extension is already loaded. To reload it, use:
  %reload_ext sql

In [39]:
# Generate train data
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
X_train_D = pd.DataFrame(X_train, columns=['x1', 'x2'])

# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-7, high=7, size=(40, 2))
X_outliers_D = pd.DataFrame(X_outliers, columns=['x1', 'x2'])

b = plt.scatter(X_train[:, 0], X_train[:, 1], c='blue')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.show()


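No random seed is set above, so the scatter and every model number below will differ from run to run. For reproducible output you could fix a seed before generating the data; the value 42 is arbitrary and only for illustration:

np.random.seed(42)  # illustrative seed; any fixed value makes the runs repeatable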

In [40]:
# Build tables
%sql DROP TABLE IF EXISTS X_train_D CASCADE
%sql PERSIST X_train_D
%sql ALTER TABLE X_train_D add column X float[]
%sql update X_train_D set X = array[x1, x2]::float[]

%sql DROP TABLE IF EXISTS X_outliers_D CASCADE
%sql PERSIST X_outliers_D
%sql ALTER TABLE X_outliers_D add column X float[]
%sql update X_outliers_D set X = array[x1, x2]::float[]


Done.
Done.
200 rows affected.
Done.
Done.
40 rows affected.
Out[40]:
[]
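MADlib's SVM functions take the features as a single array-valued column, which is why x1 and x2 are packed into the float[] column X above. As a quick sanity check (not part of the original flow), you can peek at a few persisted rows:

%sql SELECT index, x1, x2, X FROM X_train_D LIMIT 3;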

In [41]:
%%sql 
-- Train the model
DROP TABLE IF EXISTS svm_out1, svm_out1_summary, svm_out1_random CASCADE;
SELECT madlib.svm_one_class(
    'X_train_D',    -- source table
    'svm_out1',     -- output table
    'X',            -- features
    'gaussian',     -- kernel
    'gamma=1, n_components=55, random_state=3',  -- kernel params
    NULL,           -- grouping 
    'init_stepsize=0.1, lambda=10, max_iter=100, tolerance=0'  -- other params
    );
SELECT * FROM svm_out1;


Done.
1 rows affected.
1 rows affected.
Out[41]:
coef: [-0.0464306799936, -0.140449420067, -0.764612793969, 0.0561685540245, -0.252354327958, 0.0495049204882, 0.391554812087, -0.703309532619, 0.368742803259, -0.930454424768, 0.491168162198, -0.0386557437811, -0.140089009113, -0.069960677275, -0.0591263658714, -0.739785078271, -0.0294268532071, -0.908443499292, -0.670068256829, -0.585115814628, -0.127288150455, -0.132263939273, -0.63193927756, -0.267746761158, -0.308319881721, -0.112196082983, 0.314925255979, 0.489853397793, 0.558233085441, -0.317663291248, -0.0283645416631, 0.351683891295, 0.0420027146589, 0.598375924694, 0.440495824996, 0.32693078572, 0.0826075162544, -0.0926654621441, -0.608400366479, -0.103107469427, 0.625755815962, 0.111294051421, 0.503191159649, 0.561639019968, 0.146793229579, 0.343509390648, -0.0539117933362, 0.111156536713, 0.696593342258, 0.749628460803, -0.245320818382, -0.126394405676, 0.330116990245, -0.292737229582, -0.471647870958, -1.00004273785]
loss: 81.1574633093
norm_of_gradient: 113.632707563
num_iterations: 100
num_rows_processed: 201
num_rows_skipped: -1
dep_var_mapping: [-1.0, 1.0]
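
The regularization value lambda=10 above was picked by hand. MADlib's SVM module also supports grid search through built-in cross-validation; the sketch below assumes svm_one_class accepts the same list-valued parameters and n_folds syntax that the MADlib docs describe for svm_classification (svm_cv is a hypothetical table name):

%%sql
DROP TABLE IF EXISTS svm_cv, svm_cv_summary, svm_cv_random CASCADE;
SELECT madlib.svm_one_class(
    'X_train_D',    -- source table
    'svm_cv',       -- output table
    'X',            -- features
    'gaussian',     -- kernel
    'gamma=1, n_components=55, random_state=3',  -- kernel params
    NULL,           -- grouping
    'init_stepsize=0.1, lambda=[0.1, 1, 10], n_folds=3, max_iter=100'  -- CV over lambda
    );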

In [43]:
# Prediction: madlib.svm_predict(model table, new data table, id column, output table)
# First, score the training data
%sql drop table if exists y_pred_train;
%sql SELECT madlib.svm_predict('svm_out1', 'X_train_D', 'index', 'y_pred_train');
y_pred_train = %sql SELECT * from y_pred_train; 

# Next for the outliers
%sql drop table if exists y_pred_outliers;
%sql SELECT madlib.svm_predict('svm_out1', 'X_outliers_D', 'index', 'y_pred_outliers');
y_pred_outliers = %sql SELECT * from y_pred_outliers; 

%sql SELECT * FROM y_pred_outliers limit 20; -- Show the outliers
#%sql SELECT * FROM y_pred_train limit 20; -- Show the training data


Done.
1 rows affected.
200 rows affected.
Done.
1 rows affected.
40 rows affected.
20 rows affected.
Out[43]:
index prediction decision_function
0 -1.0 -1.4388730148
1 -1.0 -0.724957901147
2 -1.0 -0.372905109874
3 -1.0 -1.22121754272
4 -1.0 -0.192345798702
5 -1.0 -1.07810614716
6 -1.0 -0.290357395599
7 -1.0 -1.45117129021
8 -1.0 -1.82642387932
9 -1.0 -0.843193638379
10 -1.0 -1.02880031223
11 -1.0 -1.87290888498
12 -1.0 -1.01253395305
13 -1.0 -1.69237718475
14 -1.0 -1.22062554442
15 -1.0 -1.35157679945
16 -1.0 -0.167893081095
17 -1.0 -1.26814530164
18 -1.0 -0.383226373509
19 -1.0 -0.141301479149
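
To quantify the separation, count the misclassifications on each side. A minimal sketch, assuming ipython-sql's ResultSet.DataFrame() conversion and the column names shown above:

# Training points flagged as novel, and outliers flagged as normal
train_df = y_pred_train.DataFrame()
outliers_df = y_pred_outliers.DataFrame()
n_error_train = (train_df['prediction'] == -1).sum()
n_error_outliers = (outliers_df['prediction'] == 1).sum()
print("errors: %d/%d train, %d/%d outliers"
      % (n_error_train, len(train_df), n_error_outliers, len(outliers_df)))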

In [44]:
# Predict over the decision grid for plotting.
# The 100x100 grid below means 10,000 predictions; the commented 500x500 grid
# (250,000 points) gives a smoother frontier but takes much longer to score.
# xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=['x1', 'x2'])

%sql DROP TABLE IF EXISTS grid_points CASCADE
%sql PERSIST grid_points
%sql ALTER TABLE grid_points add column X float[]
%sql update grid_points set X = array[x1, x2]::float[]


Done.
Done.
10000 rows affected.
Out[44]:
[]

In [47]:
# Plot the decision grid
%sql drop table if exists Z_D;
%sql SELECT madlib.svm_predict('svm_out1', 'grid_points', 'index', 'Z_D');
Z_D = %sql SELECT decision_function from Z_D order by index
Z = np.array(Z_D)        # ResultSet of one column -> (10000, 1) array
Z = Z.reshape(xx.shape)  # -> (100, 100) grid aligned with xx and yy

# Orange region: decision function > 0 (not novel); green region: < 0 (novel)
plt.title("Novelty Detection")
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='orange')
plt.contourf(xx, yy, Z, levels=[Z.min(), 0], colors='green')
#plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), Z.max(), 7), cmap=plt.cm.Blues_r)
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='blue')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.show()


Done.
1 rows affected.
10000 rows affected.
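
To make the learned frontier stand out, an optional tweak is to draw the zero level set of the decision function on top of the filled regions; add this line to the plotting cell above, just before plt.show():

plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')  # decision boundary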
