In [1]:
import os
import pandas as pd
import numpy
dataDir = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data'
data = os.path.join(dataDir, 'cep.csv')

df = pd.read_csv(data)
df.set_index('key', inplace=True)
df.dtypes


Out[1]:
component     object
host          object
method        object
ms           float64
ship           int64
dtype: object

In [2]:
col = []
for el, v in df.dtypes.iteritems():
    # print el
    if v == 'object':
        col.append(el)
col


Out[2]:
['component', 'host', 'method']

In [3]:
from sklearn.feature_extraction import DictVectorizer
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

In [4]:
data, t, v = ohEncoding(df, col, replace=True)

In [35]:
data


Out[35]:
ms ship component=AIS_SENTENCE_LISTENER component=RETRACT_OLD_AISGEOMDATA component=SESSION component=SIMPLE_ANCHOR_IN component=SIMPLE_ANCHOR_OUT component=SIMPLE_DOCK_STOP component=STOP_OVER_IN component=STOP_OVER_OUT host=1de0ba-cep-00163e2e0a9d method=FIRE_ALL_RULES method=HANDLE_MESSAGE method=RETRACT method=UPDATE_ACTIVE
key
2017-06-22T11:52:54.063Z 899 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.063Z 899 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.063Z 1068 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.063Z 1068 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 283 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 283 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 303 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 303 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 290 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 290 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 2428 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 2428 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1064 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1064 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1021 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1021 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 407 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 407 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 407 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 447 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 447 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 447 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 366 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 366 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 366 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.078Z 362 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 361 636092524 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 361 636092524 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 361 636092524 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 363 636092524 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2017-06-22T11:53:04.245Z 766 224161160 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.246Z 199 225366000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.257Z 365 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.257Z 365 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.257Z 297 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.257Z 297 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.258Z 340 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.259Z 303 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.260Z 686 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.261Z 618 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.262Z 755 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.263Z 9483 305965000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.267Z 391 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.268Z 297 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.269Z 295 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 293 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 293 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 680 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 680 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.271Z 652 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.272Z 710 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.273Z 8784 305965000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.274Z 717 224126750 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.275Z 1140 211636100 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 661 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 661 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 294 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 294 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.277Z 221 224181370 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.278Z 209 211636100 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0

845 rows × 15 columns


In [5]:
# fit the model
clf = IsolationForest(max_samples='auto', verbose=1, n_jobs=-1, contamination=0.11)
clf.fit(data)
pred = clf.predict(data)
print type(pred)
# print data.shape
# print len(pred)
print pred
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
print anomalies
print type(anomalies)
# print normal


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-c6171a0ec457> in <module>()
      1 # fit the model
----> 2 clf = IsolationForest(max_samples='auto', verbose=1, n_jobs=-1, contamination=0.11)
      3 clf.fit(data)
      4 pred = clf.predict(data)
      5 print type(pred)

NameError: name 'IsolationForest' is not defined

In [ ]: