In [1]:
%matplotlib inline
from time import time
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from pandas.tseries.offsets import *
import simulated_data
In [2]:
# ---- parameters of simulated data generation and training ----

# number of simulated series (also used as the network's input width)
n_series = 6

# lengths, in hours, of the reference and subject time periods
refh = 12
subh = 1

# number of training epochs
epochs = 60

# probability to correctly classify a sample based purely on luck
chance = refh/(subh+refh)
# how much better than luck we must be before calling it an anomaly:
# 5% of the remaining distance between chance and a perfect score
cut = chance + (1-chance) * 0.05
print('chance:',chance, '\tcut:', cut)

# the same two window lengths expressed as pandas time offsets
ref = Hour(refh)
sub = Hour(subh)
In [3]:
# Load the simulated series into the working frame.
# Alternative source kept for reference: simulated_data.get_simulated_fixed_data()
df = simulated_data.get_simulated_data()

# Preview the first rows.
df.head(5)
Out[3]:
In [4]:
# Overview plot of every column in the frame against its index.
ax = df.plot(figsize=(20, 7))
ax.set_xlabel(xlabel="time", fontsize=14)
Out[4]:
In [5]:
def getModel():
    """Build and compile a fresh binary classifier.

    The network has two ReLU hidden layers of width ``n_series`` (the
    module-level setting, also used as the input width) followed by a single
    sigmoid output unit. It is compiled with binary cross-entropy loss, the
    RMSprop optimizer and an accuracy metric.

    Dropout between the layers and other loss/optimizer combinations
    (hinge + sgd, mse + rmsprop, categorical cross-entropy + sgd) were tried
    at some point and are currently disabled.

    Returns:
        The compiled ``Sequential`` model, untrained.
    """
    stack = [
        Dense(units=n_series, input_shape=(n_series,), activation='relu'),
        Dense(units=n_series, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return net
def plotHist(hist):
    """Plot the per-epoch training loss and accuracy of a fitted model.

    Args:
        hist: object returned by ``model.fit``; its ``history`` dict must
            contain ``'loss'`` and an accuracy entry.

    The accuracy entry is looked up under ``'acc'`` first and ``'accuracy'``
    second, because the key used for the ``'accuracy'`` metric differs
    between Keras releases.
    """
    history = hist.history
    # Pick whichever accuracy key this Keras version recorded.
    acc_key = 'acc' if 'acc' in history else 'accuracy'

    es = len(history['loss'])
    x = np.arange(es)  # epoch numbers 0 .. es-1
    plt.plot(x, history['loss'], '--', linewidth=2, label='loss')
    plt.plot(x, history[acc_key], '-', linewidth=2, label='acc')
    plt.legend()
    plt.show()
In [6]:
def _features_only(frame):
    """Return a new frame holding only the model input columns of ``frame``."""
    # 'flag' and 'score' are bookkeeping columns, not input series; 'Detected'
    # is added to the frame by a later cell and must not leak in on a re-run.
    excluded = ('flag', 'score', 'Detected')
    return frame[[col for col in frame.columns if col not in excluded]]


def check_for_anomaly(ref, sub, count):
    """Train a classifier to tell reference samples from subject samples.

    Reference rows are labelled 0 and subject rows 1. Each set is split
    70/30 into train and test parts, a fresh model from ``getModel()`` is
    fitted on the combined training part, and it is evaluated on the combined
    test part. When the test accuracy exceeds the module-level ``cut``, the
    training history is plotted.

    Unlike deleting columns in place, the inputs are left untouched: feature
    columns are selected into new frames.

    Args:
        ref: DataFrame with the reference-period samples.
        sub: DataFrame with the subject-period samples.
        count: iteration counter supplied by the caller; currently unused.

    Returns:
        The accuracy of the fitted model on the held-out test samples.
    """
    X_ref = _features_only(ref)
    y_ref = pd.DataFrame([0] * ref.shape[0], index=ref.index)

    X_sub = _features_only(sub)
    y_sub = pd.DataFrame([1] * sub.shape[0], index=sub.index)

    # separate Reference and Subject into Train and Test
    X_ref_train, X_ref_test, y_ref_train, y_ref_test = train_test_split(
        X_ref, y_ref, test_size=0.3, random_state=42)
    X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(
        X_sub, y_sub, test_size=0.3, random_state=42)

    # combine training ref and sub samples
    X_train = pd.concat([X_ref_train, X_sub_train]).reset_index(drop=True)
    y_train = pd.concat([y_ref_train, y_sub_train]).reset_index(drop=True)

    # combine testing ref and sub samples
    X_test = pd.concat([X_ref_test, X_sub_test])
    y_test = pd.concat([y_ref_test, y_sub_test])

    # Shuffle features and labels together so rows stay aligned.
    X_train_s, y_train_s = shuffle(X_train, y_train)

    m = getModel()
    hist = m.fit(X_train_s.values, y_train_s.values, epochs=epochs,
                 verbose=0, shuffle=True, batch_size=256)
    # evaluate() returns [loss, accuracy] for the metrics set in getModel().
    loss_and_metrics = m.evaluate(X_test.values, y_test.values)

    if loss_and_metrics[1] > cut:
        plotHist(hist)
    return loss_and_metrics[1]
In [7]:
# Every sample starts with a neutral score; evaluated windows overwrite it.
df['score'] = 0.5

# find min and max timestamps
start = df.index.min()
end = df.index.max()

# Round start down to the whole hour. Timestamps are immutable, so assigning
# to attributes does not change the value (and may raise, depending on the
# pandas version); replace() returns the adjusted timestamp instead.
start = start.replace(minute=0, second=0, microsecond=0, nanosecond=0)

# Slide a (reference, subject) window pair over the data, one subject
# period at a time. ti is the exclusive end of the current subject window.
ti = start + ref + sub
count = 0
while ti < end + 1 * Minute():
    print(count)
    startt = time()

    ref_start = ti - ref - sub
    ref_end = ti - sub
    in_ref = (df.index >= ref_start) & (df.index < ref_end)
    in_sub = (df.index >= ref_end) & (df.index < ti)
    ref_df = df[in_ref]
    sub_df = df[in_sub]

    score = check_for_anomaly(ref_df, sub_df, count)

    # Write the score to exactly the rows that formed the subject window;
    # a row at ti itself belongs to the next window.
    df.loc[in_sub, ['score']] = score
    print('\n',ti,"\trefes:" , ref_df.shape[0], "\tsubjects:", sub_df.shape[0], '\tscore:', score)

    ti = ti + sub
    count = count + 1
    endt = time()
    print("took:", endt-startt)
In [8]:
# Plot every column, now including the per-window score, and save the
# figure to disk.
ax = df.plot(figsize=(20, 7))
ax.set_xlabel(xlabel="time", fontsize=14)
plt.savefig('ANN_simulated_score.png')
In [9]:
fig, ax = plt.subplots(figsize=(20,7))
ax.set_xlabel("time", fontsize=14)

# Flag samples whose window score exceeds the detection threshold.
df['Detected'] = (df.score > cut).astype(int)

# Explicit labels so the legend has entries regardless of matplotlib version.
ax.plot(df.flag, 'r', label='flag')
ax.plot(df.score, 'g', label='score')
# Shade between 0 and the Detected series. fill() would draw a closed polygon
# through the points, which mis-shades when the series does not start and end
# at 0.
ax.fill_between(df.index, 0, df.Detected, color='b', alpha=0.3, label='Detected')
ax.legend(loc='upper left')

# Save before showing, so the file is written while the figure is still open.
fig.savefig('ANN_simulated_shaded.png')
plt.show()