In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn import svm
from sklearn.decomposition import PCA
In [2]:
# Load the consolidated data file.
# For the purpose of this demo we use a fake datafile generated in
# fake_data_generation.
fname = 'fakedata'
data = pd.read_csv(fname + ".csv")
# Preview the first rows (5 is head()'s default).
data.head(5)
Out[2]:
In [3]:
# Build the feature matrix: every column except the first three
# (latitude, longitude and time).
# DataFrame.as_matrix() was removed in pandas 1.0, so select the columns
# first and take .values; selecting before converting also keeps the array
# numeric even when the leading columns hold non-numeric data.
# NOTE: a later cell adds an 'outlier' column to `data` in place; re-running
# this cell after that would feed the flag into the PCA, so reload `data`
# first when re-running.
X_raw = data.iloc[:, 3:].values
# Look at the first few rows of the raw features.
print(X_raw[:6, :])
# PCA - dimension reduction to two dimensions.
pca = PCA(n_components=2)
X = pca.fit_transform(X_raw)
In [4]:
# Target share of entries to report as anomalies (the oddest 10%).
outliers_fraction = 0.1
# One-class SVM with an RBF kernel fitted on the 2-D PCA projection.
# nu evaluates to roughly 0.145 here; the actual 10% cut is applied later
# through a percentile threshold on the decision scores.
model = svm.OneClassSVM(kernel="rbf", gamma=0.1, nu=0.95 * outliers_fraction + 0.05)
model.fit(X)
print(model)
In [5]:
# Decision score of every point, flattened to 1-D: the higher the score,
# the less the point looks like an outlier.
y_pred = np.ravel(model.decision_function(X))
In [6]:
# Score below which a point is treated as an outlier: the
# (100 * outliers_fraction)-th percentile of the decision scores.
# Potentially we can let the user change this threshold.
threshold = stats.scoreatpercentile(y_pred, per=outliers_fraction * 100)
print(threshold)
In [7]:
# Turn the scores into boolean outlier flags (True = at or below threshold).
# This rebinds y_pred from scores to flags, so re-running this cell on its
# own would compare the flags to the threshold; re-run the scoring cell first.
y_pred = np.less_equal(y_pred, threshold)
In [8]:
# Append the outlier flag as a third column next to the two PCA coordinates
# (the boolean flag becomes 1.0 for outliers and 0.0 otherwise).
X_new = np.column_stack((X, y_pred))
print(X_new[:5])
In [9]:
# Plot the decision surface of the model together with the flagged points.
marg = 5  # padding added around the data range on both axes
# 500 x 500 evaluation grid covering the projected data plus the margin.
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - marg, X[:, 0].max() + marg, 500),
    np.linspace(X[:, 1].min() - marg, X[:, 1].max() + marg, 500),
)
# Decision score at every grid node, reshaped back to the grid layout.
Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure()
# Scores at or below the threshold (outlier region): graded blue bands.
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
# Scores above the threshold (inlier region): solid orange.
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
# Scatterplot of the data; the third column of X_new is the outlier flag.
outlier_mask = X_new[:, 2] == 1
inlier_mask = X_new[:, 2] == 0
# The flagged points are the outliers (the label used to read 'inliers').
plt.scatter(X_new[outlier_mask, 0], X_new[outlier_mask, 1], color='red', label='outliers')
plt.scatter(X_new[inlier_mask, 0], X_new[inlier_mask, 1], color='black', label='inliers')
# Without a legend the series labels above are never displayed.
plt.legend()
plt.axis('off')
plt.show()
In [10]:
# Attach the outlier flag to the data as an integer column
# (1 = outlier, 0 = not an outlier), then preview the result.
data['outlier'] = np.asarray(X_new[:, 2], dtype=int)
data.head(5)
Out[10]:
In [11]:
# Convert the flagged data to JSON (one object per row) and save it as a
# JavaScript file that defines the global variable `outlier`.
output = data.to_json(orient="records")
# The context manager closes the file even if the write fails.
with open('website_data.js', 'w') as f:
    f.write('var outlier = ' + output + ";")