In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.feature_extraction import DictVectorizer
import os
import pandas as pd
import time
import scipy
from scipy import stats
import sys

In [18]:
# dataDir = '/Users/Gabriel/Dropbox/Research/DICE/Anomaly Detection/Usecases/POSIDONIA'
# data = os.path.join(dataDir, 'cep_man_labeled.csv')

dataDir = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data'
data = os.path.join(dataDir, 'CEP_Complete_Labeled_Extended.csv')

In [19]:
df = pd.read_csv(data)
df.set_index('key', inplace=True)
dropList = ['host']
print "Droped columns are: %s" %dropList
df = df.drop(dropList, axis=1)
print "Index Name: %s" %df.index.name


Droped columns are: ['host']
Index Name: key

In [20]:
print "Dataframe shape (row, col): %s" %str(df.shape)


Dataframe shape (row, col): (4702, 5)

In [21]:
#encode dataframe
col = []
for el, v in df.dtypes.iteritems():
    # print el
    if v == 'object':
        col.append(el)
col

from sklearn.feature_extraction import DictVectorizer
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

df, t, v = ohEncoding(df, col, replace=True)

print df.shape


(6444, 15)

In [22]:
features = df.columns[:-1]


print "Detected Features are: %s" %features

 
X = df[features]
# Target always last column of dataframe
y = df.iloc[:,-1].values
print y


Detected Features are: Index([u'ms', u'ship', u'Anomaly', u'component=AIS_SENTENCE_LISTENER',
       u'component=RETRACT_OLD_AISGEOMDATA', u'component=SESSION',
       u'component=SIMPLE_ANCHOR_IN', u'component=SIMPLE_DOCK_START_OUT',
       u'component=SIMPLE_DOCK_STOP', u'component=STOP_OVER_IN',
       u'component=STOP_OVER_OUT', u'method=FIRE_ALL_RULES',
       u'method=HANDLE_MESSAGE', u'method=RETRACT'],
      dtype='object')
[ 1.  1.  1. ...,  1.  1.  0.]

In [24]:
rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt', n_estimators=50, oob_score=True)

# if isinstance(rfc, RandomForestClassifier):
#     print "test"

param_grid = {
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 15, 25]
}
start_time_g = time.time()
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X, y)
print CV_rfc.best_params_
bestParam = CV_rfc.best_params_
print "Best params for Random Forest: %s" % str(bestParam)
elapsed_time_g = time.time() - start_time_g
print "Grid Search for Random Forest took: %s" %str(elapsed_time_g)


{'max_features': 'auto', 'n_estimators': 200, 'max_depth': 5}
Best params for Random Forest: {'max_features': 'auto', 'n_estimators': 200, 'max_depth': 5}
Grid Search for Random Forest took: 179.549936056

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# fix random seed for reproducibility
seed = 7
start_time = time.time()
clfKV = RandomForestClassifier(max_depth=bestParam['max_depth'], n_estimators=bestParam['n_estimators'], max_features=bestParam['max_features'], n_jobs=-1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_val_score(clfKV, X, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
elapsed_time = time.time() - start_time
print "Cross validation took: %s" %str(elapsed_time)


Baseline: 100.00% (0.00%)
Cross validation took: 9.33692383766

In [29]:
start_time = time.time()
clf = RandomForestClassifier(max_depth=bestParam['max_depth'], n_estimators=bestParam['n_estimators'], max_features=bestParam['max_features'], n_jobs=-1)
clf.fit(X, y)

# Apply the classifier we trained to the test data (which, remember, it has never seen before)
predict = clf.predict(X)
# print predict

# View the predicted probabilities of the first 10 observations
predProb = clf.predict_proba(X)
# print predProb

score = clf.score(X, y)
print "Training Score Random Forest: %s" %score

# # Create confusion matrix
# print pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])
#
# View a list of the features and their importance scores

expDir = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/experiments'

fimp = list(zip(X, clf.feature_importances_))
print "Feature importance Random Forest Training: "
print fimp
elapsed_time = time.time() - start_time
print "Training Random Forest Took: %s" % str(elapsed_time)
dfimp = dict(fimp)
dfimp = pd.DataFrame(dfimp.items(), columns=['Metric', 'Importance'])
sdfimp = dfimp.sort('Importance', ascending=False)
dfimpCsv = 'Feature_Importance_RF_%s.csv' % 'CEP'
sdfimp.to_csv(os.path.join(expDir, dfimpCsv))


Training Score Random Forest: 1.0
Feature importance Random Forest Training: 
[('ms', 0.071915374255335965), ('ship', 0.1076995522711244), ('Anomaly', 0.011832881494625396), ('component=AIS_SENTENCE_LISTENER', 0.27740578868967797), ('component=RETRACT_OLD_AISGEOMDATA', 0.0017680835533956016), ('component=SESSION', 0.060450688192142221), ('component=SIMPLE_ANCHOR_IN', 0.0057620259298141307), ('component=SIMPLE_DOCK_START_OUT', 0.00029942165682853651), ('component=SIMPLE_DOCK_STOP', 0.050567356476212993), ('component=STOP_OVER_IN', 0.0098633548778294562), ('component=STOP_OVER_OUT', 0.010501388263294253), ('method=FIRE_ALL_RULES', 0.043842489251508458), ('method=HANDLE_MESSAGE', 0.34565299855458009), ('method=RETRACT', 0.002438596533630738)]
Training Random Forest Took: 1.65897488594
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipykernel_launcher.py:30: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)

In [16]:
dt_g = DecisionTreeClassifier()

param_grid_dt = {
    'criterion': ['gini', 'random'],
    'splitter':['best', 'random'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 15, 25, 50, 100],
    'min_sample_split': [2, 5, 10]
}

start_time_d = time.time()
CV_dt = GridSearchCV(estimator=dt_g, param_grid=param_grid_dt, cv=5)
CV_dt.fit(X, y)
print CV_dt.best_params_
bestParam_dt = CV_dt.best_params_
print "Best params for Decision Tree: %s" % str(bestParam_dt)
elapsed_time_dt = time.time() - start_time_d
print "Grid Search for Decision Tree took: %s" %str(elapsed_time_dt)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-5affc87a6e2c> in <module>()
----> 1 dt_g = DecisionTreeClassifier()
      2 
      3 param_grid_dt = {
      4     'criterion': ['gini', 'random'],
      5     'splitter':['best', 'random'],

NameError: name 'DecisionTreeClassifier' is not defined

In [ ]:
dt = DecisionTreeClassifier(criterion=settings["criterion"], splitter=settings["splitter"],
                                    max_features=max_features, max_depth=max_depth,
                                    min_samples_split=float(settings["min_sample_split"]),
                                    min_weight_fraction_leaf=float(settings["min_weight_faction_leaf"]), random_state=settings["random_state"])

dt.fit(X, y)
predict = dt.predict(X)
print "Prediction for Decision Tree Training:"
print predict

predProb = dt.predict_proba(X)
print "Prediction probabilities for Decision Tree Training:"
print predProb

score = dt.score(X, y)
print "Decision Tree Training Score: %s" % str(score)

fimp = list(zip(X, dt.feature_importances_))
print "Feature importance Random Forest Training: "
print fimp
dfimp = dict(fimp)
dfimp = pd.DataFrame(dfimp.items(), columns=['Metric', 'Importance'])
sdfimp = dfimp.sort('Importance', ascending=False)
dfimpCsv = 'Feature_Importance_%s.csv' % mname
sdfimp.to_csv(os.path.join(self.modelDir, dfimpCsv))

In [45]:
dataDir2 = '/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data'
cep = os.path.join(dataDir, 'cep.csv')

dfCep = pd.read_csv(cep)
dfCep = dfCep.drop(dropList, axis=1)
dfCep.set_index('key', inplace=True)

#encode dataframe
col = []
for el, v in dfCep.dtypes.iteritems():
    # print el
    if v == 'object':
        col.append(el)
col

from sklearn.feature_extraction import DictVectorizer
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

dfCep, t, v = ohEncoding(dfCep, col, replace=True)
dfCep


Out[45]:
ms ship component=AIS_SENTENCE_LISTENER component=RETRACT_OLD_AISGEOMDATA component=SESSION component=SIMPLE_ANCHOR_IN component=SIMPLE_ANCHOR_OUT component=SIMPLE_DOCK_STOP component=STOP_OVER_IN component=STOP_OVER_OUT method=FIRE_ALL_RULES method=HANDLE_MESSAGE method=RETRACT method=UPDATE_ACTIVE
key
2017-06-22T11:52:54.063Z 899 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.063Z 899 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.063Z 1068 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.063Z 1068 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 283 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 283 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 303 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.066Z 303 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 290 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 290 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 2428 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.067Z 2428 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1064 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1064 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1021 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.076Z 1021 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 407 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 407 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 407 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 447 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 447 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 447 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 366 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 366 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.077Z 366 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.078Z 362 636092524 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 361 636092524 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 361 636092524 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 361 636092524 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:52:54.080Z 363 636092524 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2017-06-22T11:53:04.245Z 766 224161160 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.246Z 199 225366000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.257Z 365 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.257Z 365 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.257Z 297 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.257Z 297 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.258Z 340 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.259Z 303 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.260Z 686 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.261Z 618 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.262Z 755 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.263Z 9483 305965000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.267Z 391 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.268Z 297 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.269Z 295 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 293 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 293 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 680 305965000 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.270Z 680 305965000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.271Z 652 305965000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.272Z 710 305965000 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2017-06-22T11:53:04.273Z 8784 305965000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.274Z 717 224126750 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.275Z 1140 211636100 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 661 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 661 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 294 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.276Z 294 224133320 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.277Z 221 224181370 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2017-06-22T11:53:04.278Z 209 211636100 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

845 rows × 14 columns


In [47]:
cep_predict = clf.predict(dfCep)
cep_predict
# print predict

# View the predicted probabilities of the first 10 observations

# print predProb


Out[47]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [48]:
#attach to dataframe labels
dfCep['Target'] = cep_predict
dfCep.to_csv(os.path.join(expDir, 'CEP_Anomaly_exp.csv'))

#dump serialized model
import cPickle as pickle
fname = os.path.join(expDir, 'CEP_Model_EXP')
pickle.dump(clf, open(fname, "wb"))

In [ ]: