In [1]:
import pandas as pd
from IPython.display import display
from IPython.display import Image
import os

dataDir = "/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data"

In [2]:
df = pd.read_csv(os.path.join(dataDir, "Storm_Complete_labeled.csv"))
print(df.shape)
df.dtypes


(60218, 56)
Out[2]:
key                                    int64
bolts_0_acked                        float64
topologyStats_all_transferred        float64
bolts_0_tasks                        float64
bolts_0_failed                       float64
bolts_0_executors                    float64
topologyStats_3h_window              float64
topologyStats_all_acked              float64
topologyStats_all_completeLatency    float64
topologyStats_all_emitted            float64
topologyStats_all_failed             float64
topologyStats_3h_emitted             float64
topologyStats_3h_failed              float64
bolts_1_executeLatency               float64
bolts_1_processLatency               float64
bolts_1_tasks                        float64
bolts_1_transferred                  float64
bolts_1_executors                    float64
bolts_1_failed                       float64
bolts_1_emitted                      float64
bolts_1_executed                     float64
bolts_1_acked                        float64
bolts_1_capacity                     float64
msgTimeout                           float64
executorsTotal                       float64
workersTotal                         float64
tasksTotal                           float64
topologyStats_10m_completeLatency    float64
topologyStats_10m_acked              float64
topologyStats_10m_failed             float64
topologyStats_10m_emitted            float64
topologyStats_1d_completeLatency     float64
topologyStats_1d_acked               float64
bolts_0_emitted                      float64
topologyStats_1d_emitted             float64
topologyStats_10m_window             float64
topologyStats_10m_transferred        float64
bolts_0_processLatency               float64
bolts_0_executeLatency               float64
topologyStats_1d_failed              float64
topologyStats_3h_transferred         float64
topologyStats_1d_window              float64
topologyStats_1d_transferred         float64
topologyStats_3h_completeLatency     float64
spouts_0_tasks                       float64
bolts_0_capacity                     float64
spouts_0_completeLatency             float64
spouts_0_emitted                     float64
spouts_0_acked                       float64
spouts_0_failed                      float64
spouts_0_executors                   float64
bolts_0_executed                     float64
spouts_0_transferred                 float64
bolts_0_transferred                  float64
topologyStats_3h_acked               float64
Target                                 int64
dtype: object
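
A quick look at the class balance is useful before filtering. This is an illustrative aside (not part of the original run); the only assumption taken from the cells below is that anomalies are marked with Target == -1:

In [ ]:
# illustrative check: how many rows are flagged anomalous (Target == -1)
# versus everything else
print(df['Target'].value_counts())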

In [3]:
# keep only the anomalous rows (Target == -1) for clustering;
# .copy() avoids the SettingWithCopyWarning when cluster labels are added later
data_labeled = df[df["Target"] == -1].copy()
data_labeled.set_index('key', inplace=True)
print(data_labeled.shape)


(6624, 55)

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(data_labeled)
db = DBSCAN(eps=0.9, min_samples=40).fit(X)
labels = db.labels_
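
The eps=0.9 / min_samples=40 pair was presumably tuned by hand; a common way to sanity-check eps is the k-distance heuristic. A minimal sketch (not from the original run), reusing the scaled matrix X:

In [ ]:
# k-distance heuristic: sort each point's distance to its min_samples-th
# nearest neighbour; an elbow in this curve suggests a value for eps
import numpy as np
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=40).fit(X)
dists, _ = nn.kneighbors(X)             # distances to the 40 nearest neighbours
kdist = np.sort(dists[:, -1])           # 40th-neighbour distance, ascending
print(kdist[[0, len(kdist) // 2, -1]])  # min / median / max as a rough summary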

In [5]:
print(labels)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)  # -1 marks DBSCAN noise
print(n_clusters_)


[-1 -1 -1 ..., 13  9 -1]
19
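
Cluster sizes are worth inspecting alongside the raw count; a small aside (not from the original run), run before the labels are remapped below:

In [ ]:
# per-cluster membership counts; -1 is DBSCAN's noise bucket
import numpy as np
uniq, counts = np.unique(labels, return_counts=True)
print(dict(zip(uniq.tolist(), counts.tolist())))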

In [6]:
import numpy as np
# largest cluster label assigned by DBSCAN
bval = np.amax(labels)
print(bval)
# fold the noise points (-1) into a cluster of their own
labels[labels == -1] = bval + 1
# shift all labels up by one so that 0 stays free for normal events
nlabels = labels + 1
data_labeled['Target2'] = nlabels
print(data_labeled[['Target', 'Target2']])


18
               Target  Target2
key                           
1497445890000      -1       20
1497445900000      -1       20
1497445910000      -1       20
1497445920000      -1       20
1497445930000      -1       20
1497445940000      -1       20
1497445950000      -1       20
1497445960000      -1       20
1497445970000      -1       20
1497445980000      -1       20
1497445990000      -1       20
1497446000000      -1       20
1497446010000      -1       20
1497446020000      -1       20
1497446030000      -1       20
1497446040000      -1       20
1497446050000      -1       20
1497446060000      -1       20
1497446070000      -1       20
1497446080000      -1       20
1497446090000      -1       20
1497446100000      -1       20
1497446110000      -1       20
1497446120000      -1       20
1497446130000      -1       20
1497446140000      -1       20
1497446150000      -1       20
1497446160000      -1       20
1497446170000      -1       20
1497446180000      -1       20
...               ...      ...
1498047770000      -1       20
1498047780000      -1       20
1498047790000      -1       20
1498047800000      -1       20
1498047810000      -1       20
1498047820000      -1       20
1498047830000      -1       20
1498047840000      -1       20
1498047850000      -1       20
1498047860000      -1       14
1498047870000      -1       20
1498047880000      -1       17
1498047890000      -1       14
1498047900000      -1       10
1498047910000      -1       17
1498047920000      -1       14
1498047930000      -1       20
1498047940000      -1       17
1498047950000      -1       14
1498047960000      -1       10
1498047970000      -1       15
1498047980000      -1       14
1498047990000      -1       10
1498048000000      -1       15
1498048010000      -1       14
1498048020000      -1       10
1498048030000      -1       20
1498048040000      -1       14
1498048050000      -1       10
1498048060000      -1       20

[6624 rows x 2 columns]
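
A quick verification that the remapping behaved as intended (illustrative, not from the original run):

In [ ]:
# after the shift the anomaly labels should run from 1 upward,
# leaving 0 free for normal events
print(np.unique(nlabels))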

In [7]:
# sanity check: no anomaly carries label 0 (reserved for normal events),
# so this array is empty
normal = np.argwhere(nlabels == 0)
normal

# the anomalous timestamps that will receive cluster labels
data_labeled.index.values


Out[7]:
array([1497445890000, 1497445900000, 1497445910000, ..., 1498048040000,
       1498048050000, 1498048060000])

In [8]:
df.set_index('key', inplace=True)
# initialize an empty label column
df['TargetF'] = np.nan

# copy the cluster label of each anomaly back into the full dataframe
# (df.at replaces the deprecated df.set_value)
for k in data_labeled.index.values:
    df.at[k, 'TargetF'] = data_labeled.loc[k, 'Target2']
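
For reference, the per-key loop can be collapsed into one aligned assignment; a sketch that should be equivalent here, since both frames are indexed by 'key':

In [ ]:
# equivalent vectorized merge: pandas aligns the assignment on the
# shared 'key' index, so no explicit loop is needed
df.loc[data_labeled.index, 'TargetF'] = data_labeled['Target2']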


In [11]:
# mark all remaining (normal) instances as 0
df = df.fillna(0)
# confirm no missing values are left
df.isnull().values.any()


Out[11]:
False

In [13]:
# drop the original binary label and persist the clustered labeling
df = df.drop(['Target'], axis=1)
df.to_csv(os.path.join(dataDir, 'my.csv'))
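
An optional round-trip check on the exported file (illustrative; the filename follows the cell above):

In [ ]:
# reload the exported csv and confirm the shape and that TargetF is complete
check = pd.read_csv(os.path.join(dataDir, 'my.csv'), index_col='key')
print(check.shape)
print(check['TargetF'].isnull().any())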
