In [1]:
%matplotlib inline
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd

from scipy import stats
 
from sklearn import svm
import sklearn.preprocessing
from sklearn.decomposition import PCA

In [2]:
#read the consolidated data file

fname = 'hotspots_with_average_sensors_data'
data = pd.read_csv(fname+".csv")
# print(data.columns)
data.head()


Out[2]:
datetime latitude longitude ta wd ws rh fsd
0 2015-01-01 01:00:00 -37.9 145.3 17.39458 112.645153 2.538481 79.370460 253.944641
1 2015-01-01 01:00:00 -32.1 115.9 22.10339 194.336502 4.459360 52.160203 351.146603
2 2015-01-01 02:00:00 -31.9 115.9 22.10339 194.336502 4.459360 52.160203 351.146603
3 2015-01-01 02:00:00 -31.9 115.9 22.10339 194.336502 4.459360 52.160203 351.146603
4 2015-01-01 02:00:00 -31.8 115.8 22.10339 194.336502 4.459360 52.160203 351.146603
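
Before modelling, it can be worth a quick check of the ranges and completeness of the sensor columns; a minimal sketch (not part of the original notebook):

# summary statistics of the numeric columns
print(data.describe())
# missing values per column
print(data.isnull().sum())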

In [3]:
# get the variables - all columns except datetime, latitude and longitude
X_raw = data.iloc[:, 3:].values

#take a look at it
print(X_raw.shape)


(12267, 5)

In [4]:
# X_raw= X_raw[:1000,1:-1]

In [5]:
# PCA - reduce the data to two dimensions
pca = PCA(n_components=2)
X = pca.fit_transform(X_raw)

# take a look at the first few rows after PCA
print(X[:4, :])


[[  40.45350813   17.38145378]
 [  -2.27084475 -104.78279166]
 [  -2.27084475 -104.78279166]
 [  -2.27084475 -104.78279166]]
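
The raw features are on very different scales (temperature around 20 against columns in the hundreds), so unscaled PCA is dominated by the largest-magnitude variables. A minimal sketch of standardising first, assuming that is what the unused sklearn.preprocessing import was meant for (X_alt is a hypothetical variable that could replace X below):

# scale each feature to zero mean and unit variance, then project to 2D
scaler = sklearn.preprocessing.StandardScaler()
X_alt = PCA(n_components=2).fit_transform(scaler.fit_transform(X_raw))
print(X_alt[:4, :])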

In [6]:
# we will mark the oddest 0.1% of entries as anomalies
outliers_fraction = 0.001
# nu (an upper bound on the fraction of training errors) must lie in (0, 1];
# 0.95 * 0.001 + 0.05 = 0.05095, matching the nu shown in the fitted model below
model = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1).fit(X)
print(model)


OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.1, kernel='rbf',
      max_iter=-1, nu=0.05095, random_state=None, shrinking=True,
      tol=0.001, verbose=False)
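
OneClassSVM also exposes a predict method that returns +1 for inliers and -1 for outliers according to the model's own boundary; the notebook instead thresholds decision_function at a chosen percentile so that roughly outliers_fraction of the points is flagged. A quick comparison, as a sketch:

# number of points the fitted model itself places outside its boundary
labels = model.predict(X)
print((labels == -1).sum(), 'points outside the learned boundary')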

In [7]:
# decision_function returns each point's signed distance to the learned boundary;
# larger values mean the point looks more like an inlier
y_pred = model.decision_function(X).ravel()

In [8]:
# potentially we could let the user change this threshold
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
print(threshold)


-0.000480528529984

In [9]:
# flag the lowest-scoring 0.1% of points as outliers
y_pred = y_pred <= threshold
# print(y_pred)
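
As a sanity check (a sketch, not part of the original notebook), the number of flagged points should be roughly outliers_fraction of the dataset:

print(int(y_pred.sum()), 'of', len(y_pred), 'points flagged as outliers')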

In [10]:
# keep the two PCA coordinates together with a flag = 1 if the point is an outlier
X_new = np.c_[X, y_pred]
# print(min(X[:,1]))
print(X_new[:5, :])


[[  40.45350813   17.38145378    0.        ]
 [  -2.27084475 -104.78279166    0.        ]
 [  -2.27084475 -104.78279166    0.        ]
 [  -2.27084475 -104.78279166    0.        ]
 [  -2.27084475 -104.78279166    0.        ]]

In [11]:
# plot the decision function and the flagged points
marg = 5
xx, yy = np.meshgrid(np.linspace(min(X[:, 0]) - marg, max(X[:, 0]) + marg, 500),
                     np.linspace(min(X[:, 1]) - marg, max(X[:, 1]) + marg, 500))
Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()

# filled contours: blue below the threshold (outlier region), orange above it (inlier region)
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

# scatterplot of the data

plt.scatter(X_new[X_new[:, 2] == 1][:, 0], X_new[X_new[:, 2] == 1][:, 1], color='red', label='outliers')
plt.scatter(X_new[X_new[:, 2] == 0][:, 0], X_new[X_new[:, 2] == 0][:, 1], color='black', label='inliers')
plt.legend()
plt.axis('off')
plt.show()



In [12]:
# add a column with the outlier flag to the data
data['outlier'] = X_new[:, 2].astype(int)
data.head()


Out[12]:
datetime latitude longitude ta wd ws rh fsd outlier
0 2015-01-01 01:00:00 -37.9 145.3 17.39458 112.645153 2.538481 79.370460 253.944641 0
1 2015-01-01 01:00:00 -32.1 115.9 22.10339 194.336502 4.459360 52.160203 351.146603 0
2 2015-01-01 02:00:00 -31.9 115.9 22.10339 194.336502 4.459360 52.160203 351.146603 0
3 2015-01-01 02:00:00 -31.9 115.9 22.10339 194.336502 4.459360 52.160203 351.146603 0
4 2015-01-01 02:00:00 -31.8 115.8 22.10339 194.336502 4.459360 52.160203 351.146603 0

In [13]:
# convert the output to JSON and save it as a JavaScript variable
output = data.to_json(orient="records")

with open('website_data.js', 'w') as f:
    f.write('var data = ' + output + ';')
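
If a plain JSON file is preferred over a JavaScript variable (for example, to fetch the data with an HTTP request rather than a script tag), pandas can write it directly; a sketch with a hypothetical file name:

data.to_json('website_data.json', orient='records')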