In [1]:
import pandas as pd
from IPython.display import display
from IPython.display import Image
import os
dataDir = "/Users/Gabriel/Documents/workspaces/diceWorkspace/dmon-adp/data"
In [2]:
df = pd.read_csv(os.path.join(dataDir, "Storm_Complete_labeled.csv"))
print(df.shape)
# df2 = pd.read_csv(os.path.join(dataDir, "Storm_anomalies_Clustered.csv"))
# df_nokey = df.drop("key", axis=1)
# print(df_nokey.shape)
# print(df2.shape)
df.dtypes
Out[2]:
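Before filtering, it helps to see how the labels are distributed; Target is assumed here to hold -1 for anomalies, per the filter in the next cell. A minimal sketch:

# sketch: label balance in the Target column (not part of the original run)
print(df["Target"].value_counts())
# fraction of rows flagged as anomalous
print((df["Target"] == -1).mean())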
In [3]:
# keep only the rows flagged as anomalies for clustering;
# .copy() avoids SettingWithCopyWarning when columns are added later
data_labeled = df[df["Target"] == -1].copy()
data_labeled.set_index('key', inplace=True)
print(data_labeled.shape)
In [4]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# standardize features so eps operates on comparable scales;
# the constant Target column (-1 everywhere) scales to zero and is inert
X = StandardScaler().fit_transform(data_labeled)
db = DBSCAN(eps=0.9, min_samples=40).fit(X)
labels = db.labels_
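eps=0.9 and min_samples=40 are fixed here; a common way to sanity-check eps is a k-distance curve (sorted distance to the min_samples-th neighbour), where the elbow suggests a reasonable value. A sketch, not part of the original run:

# sketch: k-distance curve to sanity-check the choice of eps
from sklearn.neighbors import NearestNeighbors
import numpy as np
nn = NearestNeighbors(n_neighbors=40).fit(X)
dists, _ = nn.kneighbors(X)
# sorted distance to each point's 40th neighbour
kdist = np.sort(dists[:, -1])
print(kdist[:10], kdist[-10:])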
In [5]:
print(labels)
# number of clusters found, excluding the noise label (-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(n_clusters_)
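Cluster sizes, including how much DBSCAN marked as noise, are often as informative as the count alone; a small sketch:

# sketch: size of each cluster, with -1 being DBSCAN noise
import numpy as np
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))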
In [6]:
import numpy as np
# largest cluster label assigned by DBSCAN
bval = np.amax(labels)
print(bval)
# reassign DBSCAN noise (-1) to its own cluster label
labels[labels == -1] = bval + 1
# shift all labels by one so that 0 is free for normal events
nlabels = labels + 1
data_labeled['Target2'] = nlabels
print(data_labeled[['Target', 'Target2']])
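The relabelling scheme is easiest to see on a toy array: noise becomes max+1, then everything shifts up by one so 0 stays reserved for normal events. A sketch with made-up labels:

# sketch: the relabelling on a toy label array
import numpy as np
toy = np.array([0, 1, -1, 2, -1])
toy[toy == -1] = toy.max() + 1   # noise becomes cluster 3
print(toy + 1)                   # [1 2 4 3 4]; 0 remains free for normal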
In [7]:
# sanity check: no anomaly should have ended up with label 0 (reserved for normal)
normal = np.argwhere(nlabels == 0)
normal
data_labeled.index.values
Out[7]:
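By construction nlabels is everywhere >= 1, so the argwhere result should be empty; an explicit assertion makes the invariant fail loudly instead of relying on eyeballing the output:

# sketch: assert the invariant rather than inspecting it
assert (nlabels > 0).all(), "label 0 must stay reserved for normal events"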
In [8]:
df.set_index('key', inplace=True)
# initialize an empty column for the final labels
df['TargetF'] = np.nan
# copy the clustered anomaly labels back into the original dataframe
for k in data_labeled.index.values:
    df.loc[k, 'TargetF'] = data_labeled.loc[k, 'Target2']
# sentinel = 0
# for i, row in df.iterrows():
#     if sentinel > 20:
#         break
#     else:
#         print(i)
#         sentinel += 1
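The row-by-row loop works, but pandas can do the same assignment in one aligned operation, since both frames share the key index; a sketch of the vectorized form:

# sketch: vectorized equivalent of the loop above, aligned on the shared 'key' index
df.loc[data_labeled.index, 'TargetF'] = data_labeled['Target2']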
In [11]:
# df['TargetF']
# mark all normal instances (still NaN) as 0
df = df.fillna(0)
df.isnull().values.any()
Out[11]:
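fillna leaves TargetF as a float column, since NaN forces a float dtype; if integer labels are wanted in the CSV, a cast is a one-liner:

# sketch: cast the final label column back to integers
df['TargetF'] = df['TargetF'].astype(int)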
In [13]:
# drop the original binary label and save the final clustered labels
df = df.drop(['Target'], axis=1)
df.to_csv(os.path.join(dataDir, 'my.csv'))
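A quick round-trip read confirms the file wrote what we expect; the filename and the key index come straight from the cell above:

# sketch: read the file back and verify shape and the final label column
check = pd.read_csv(os.path.join(dataDir, 'my.csv'), index_col='key')
print(check.shape)
print(check['TargetF'].value_counts())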