Time-Series Similarity - Distinct IPs Accessed Across the Perimeter, by Hour


In [1]:
import pandas            as pd
import matplotlib.pyplot as plt
import matplotlib.cm     as cm
import numpy             as np
from   mlpy                   import dtw_std
from   scipy.spatial.distance import pdist
from   scipy.cluster          import hierarchy

%matplotlib inline
# `pd.options.display.mpl_style` was deprecated and later removed from pandas,
# so assigning to it fails on current releases and stops the notebook at its
# first cell.  Styling is applied through matplotlib's own style sheets
# instead; 'ggplot' is used as the similar-looking replacement.
plt.style.use('ggplot')

In [2]:
def plot_dtw(series_x, series_y):
    """Show the DTW cost matrix of two series with the warping path on top.

    Runs ``dtw_std`` with ``dist_only=False`` and ``squared=True``, renders
    the transposed cost matrix as a grey-scale image on a 12x12 inch figure
    and overlays the returned path as a white line.

    Parameters
    ----------
    series_x, series_y
        The two sequences handed to ``dtw_std``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # The scalar distance is not needed for the picture.
    _, cost, path = dtw_std(series_x, series_y, dist_only=False, squared=True)
    n_x, n_y = cost.shape

    fig = plt.figure(1, figsize=(12, 12))
    ax = fig.add_subplot(111)

    # cost is transposed so that series_x runs along the horizontal axis.
    plt.imshow(cost.T, origin='lower', cmap=cm.gray, interpolation='nearest')
    plt.plot(path[0], path[1], 'w')

    # Pin the limits to the image extent so the path line cannot widen them.
    ax.set_xlim((-0.5, n_x - 0.5))
    ax.set_ylim((-0.5, n_y - 0.5))
    plt.show()

def plot_compare(df, labels=None):
    """Plot the selected columns of ``df`` together on one 12x8 inch figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the series to compare.
    labels : sequence of column labels, optional
        Columns to draw.  When omitted or empty, every column of ``df`` is
        drawn (an empty selection cannot be plotted).

    Returns
    -------
    None
    """
    # No ``plt.clf()`` here: DataFrame.plot opens a figure of its own, so
    # clearing the "current" figure first only created an extra, empty figure
    # that showed up as a bare ``<matplotlib.figure.Figure ...>`` output.
    if labels is None or len(labels) == 0:
        selection = df
    else:
        selection = df[labels]
    selection.plot(figsize=(12, 8))
    
def score_pair(series_x, series_y):
    """Return the ``dtw_std`` distance between two series.

    Calls ``dtw_std`` with ``dist_only=True`` and ``squared=True``, so only
    the distance value comes back.  The two-argument signature lets it be
    used as a custom metric callable.
    """
    dtw_distance = dtw_std(series_x, series_y, dist_only=True, squared=True)
    return dtw_distance

In [3]:
# Read the tab-separated samples; column 0 is parsed as datetimes
# (presumably the sTime column used below -- confirm against the file).
df = pd.read_table("samples/cleaned_dipd.tsv", sep='\t', parse_dates=[0])

# Index by sTime but keep it as a regular column too (drop=False), because
# the day-of-week / hour features are derived from that column.
df = df.set_index(['sTime'], drop=False)
df['dow'] = df['sTime'].apply(lambda stamp: stamp.dayofweek)
df['hr'] = df['sTime'].apply(lambda stamp: stamp.hour)

# Mean dIP-Distinct for every (day-of-week, hour) pair, with dow/hr brought
# back as columns.
hourly = (
    df[['dow', 'hr', 'dIP-Distinct']]
    .groupby(['dow', 'hr'])
    .agg([np.mean])
    .reset_index()
)

In [4]:
# Spread of the per-hour means of dIP-Distinct, one box per day of week.
hourly.boxplot(
    column=['dIP-Distinct'],
    by='dow',
    showmeans=True,
    figsize=(12, 6),
)


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x108a48cd0>

In [5]:
# Spread of the per-day means of dIP-Distinct, one box per hour of day.
hourly.boxplot(
    column=['dIP-Distinct'],
    by='hr',
    showmeans=True,
    figsize=(24, 9),
)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x10788f0d0>

In [6]:
# Mean dIP-Distinct for every (dow, hr, sIP) combination.
ip_groups = (
    df[['dow', 'hr', 'sIP', 'dIP-Distinct']]
    .groupby(['dow', 'hr', 'sIP'])
    .mean()
)

# Pivot to one row per (dow, hr) and one column per sIP; combinations with no
# value are filled with 0.
# `axis` is passed by keyword: pandas 2.0+ no longer accepts it positionally
# in DataFrame.drop, while the keyword form works on old and new releases.
wide = (
    ip_groups.transpose()
    .stack(['dow', 'hr'])
    .fillna(0)
    .reset_index()
    .drop('level_0', axis=1)   # discard the leftover 'dIP-Distinct' level
    .set_index(['dow', 'hr'])
)

In [7]:
# Every column of `wide` drawn against the (dow, hr) index; the legend is
# switched off.
wide.plot(figsize=(18, 9), legend=False)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b8a1990>

In [8]:
# pdist compares rows, so flip `wide` to get one row per original column.
rowvecs = wide.T

# Condensed pairwise distances using the DTW-based score as the metric.
dist_matrix = pdist(rowvecs, metric=score_pair)

# Average-linkage hierarchical clustering over those distances.
mat_linkage = hierarchy.linkage(dist_matrix, method='average')

In [9]:
# Dendrogram of the linkage, truncated by level (p=12), drawn with
# orientation='right' and leaves labelled from the rows of `rowvecs`.
dend = hierarchy.dendrogram(
    mat_linkage,
    p=12,
    truncate_mode='level',
    orientation='right',
    labels=rowvecs.index,
    distance_sort=True,
)

# Resize the figure the dendrogram was drawn on before showing it.
fig = plt.gcf()
fig.set_size_inches(12, 36)
plt.show()



In [10]:
# Overlay the (dow, hr) profiles of client_134 and client_1290.
plot_compare(wide, ['client_134', 'client_1290'])


<matplotlib.figure.Figure at 0x10f569b90>

In [11]:
# DTW cost matrix and warping path for the same pair of clients.
plot_dtw(series_x=wide['client_134'], series_y=wide['client_1290'])



In [12]:
# Overlay the (dow, hr) profiles of client_215 and client_978.
plot_compare(wide, ['client_215', 'client_978'])


<matplotlib.figure.Figure at 0x10d7289d0>

In [13]:
# DTW cost matrix and warping path for the same pair of clients.
plot_dtw(series_x=wide['client_215'], series_y=wide['client_978'])