In [1]:
import os
import pandas as pd
import numpy
dataDir = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data'
data = os.path.join(dataDir, 'cep.csv')
df = pd.read_csv(data)
df.set_index('key', inplace=True)
df.dtypes
Out[1]:
component object
host object
method object
ms float64
ship int64
dtype: object
In [2]:
col = []
for el, v in df.dtypes.iteritems():
# print el
if v == 'object':
col.append(el)
col
Out[2]:
['component', 'host', 'method']
In [3]:
from sklearn.feature_extraction import DictVectorizer
def ohEncoding(data, cols, replace=False):
vec = DictVectorizer()
mkdict = lambda row: dict((col, row[col]) for col in cols)
vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
vecData.columns = vec.get_feature_names()
vecData.index = data.index
if replace is True:
data = data.drop(cols, axis=1)
data = data.join(vecData)
return data, vecData, vec
In [4]:
data, t, v = ohEncoding(df, col, replace=True)
In [35]:
data
Out[35]:
ms
ship
component=AIS_SENTENCE_LISTENER
component=RETRACT_OLD_AISGEOMDATA
component=SESSION
component=SIMPLE_ANCHOR_IN
component=SIMPLE_ANCHOR_OUT
component=SIMPLE_DOCK_STOP
component=STOP_OVER_IN
component=STOP_OVER_OUT
host=1de0ba-cep-00163e2e0a9d
method=FIRE_ALL_RULES
method=HANDLE_MESSAGE
method=RETRACT
method=UPDATE_ACTIVE
key
2017-06-22T11:52:54.063Z
899
305965000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.063Z
899
305965000
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.063Z
1068
305965000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.063Z
1068
305965000
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.066Z
283
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.066Z
283
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.066Z
303
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.066Z
303
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.067Z
290
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.067Z
290
305965000
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.067Z
2428
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.067Z
2428
305965000
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.076Z
1064
305965000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.076Z
1064
305965000
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.076Z
1021
305965000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.076Z
1021
305965000
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
407
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
407
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
407
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
447
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
447
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
447
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
366
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
366
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.077Z
366
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.078Z
362
636092524
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.080Z
361
636092524
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.080Z
361
636092524
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.080Z
361
636092524
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:52:54.080Z
363
636092524
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2017-06-22T11:53:04.245Z
766
224161160
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.246Z
199
225366000
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.257Z
365
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.257Z
365
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.257Z
297
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.257Z
297
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.258Z
340
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.259Z
303
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.260Z
686
305965000
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.261Z
618
305965000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.262Z
755
305965000
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.263Z
9483
305965000
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.267Z
391
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.268Z
297
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.269Z
295
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.270Z
293
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.270Z
293
305965000
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.270Z
680
305965000
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.270Z
680
305965000
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.271Z
652
305965000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.272Z
710
305965000
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
2017-06-22T11:53:04.273Z
8784
305965000
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.274Z
717
224126750
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.275Z
1140
211636100
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.276Z
661
224133320
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.276Z
661
224133320
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.276Z
294
224133320
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.276Z
294
224133320
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.277Z
221
224181370
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
2017-06-22T11:53:04.278Z
209
211636100
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
845 rows × 15 columns
In [5]:
# fit the model
clf = IsolationForest(max_samples='auto', verbose=1, n_jobs=-1, contamination=0.11)
clf.fit(data)
pred = clf.predict(data)
print type(pred)
# print data.shape
# print len(pred)
print pred
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
print anomalies
print type(anomalies)
# print normal
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-5-c6171a0ec457> in <module>()
1 # fit the model
----> 2 clf = IsolationForest(max_samples='auto', verbose=1, n_jobs=-1, contamination=0.11)
3 clf.fit(data)
4 pred = clf.predict(data)
5 print type(pred)
NameError: name 'IsolationForest' is not defined
In [ ]:
Content source: igabriel85/dmon-adp
Similar notebooks: