In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from mlpy import dtw_std
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
%matplotlib inline
# Ask pandas for its bundled plot styling when the option is available.
try:
    pd.options.display.mpl_style = 'default'
except (AttributeError, KeyError):
    # pandas builds that do not register `display.mpl_style` reject the
    # assignment with an OptionError (a subclass of both caught types).
    # Fall back to a matplotlib style sheet so the notebook still runs;
    # 'ggplot' is assumed to be a comparable look -- adjust to taste.
    plt.style.use('ggplot')
In [2]:
def plot_dtw(series_x, series_y):
    """Draw the DTW cost matrix of two series with the warping path on top.

    Parameters
    ----------
    series_x, series_y : array-like
        The two sequences handed to ``dtw_std``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # dist_only=False makes dtw_std return (distance, cost matrix, path);
    # only the cost matrix and the path are needed for the picture.
    _, cost, path = dtw_std(series_x, series_y, dist_only=False, squared=True)

    # Always start from a fresh figure: asking for a fixed figure number would
    # silently reuse an existing figure, ignore `figsize` and draw over it.
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(111)

    # The matrix is transposed so its first dimension runs along the x axis.
    ax.imshow(cost.T, origin='lower', cmap=cm.gray, interpolation='nearest')
    ax.plot(path[0], path[1], 'w')

    # Pin the limits to the matrix extent so the path line cannot rescale it.
    ax.set_xlim((-0.5, cost.shape[0] - 0.5))
    ax.set_ylim((-0.5, cost.shape[1] - 0.5))

    # Assumes cost has one row per element of series_x -- TODO confirm.
    ax.set_xlabel('series_x index')
    ax.set_ylabel('series_y index')
    plt.show()
def plot_compare(df, labels=None):
    """Plot the chosen columns of ``df`` together in one 12x8 figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one series per column.
    labels : column label or list of column labels, optional
        Columns to draw. When omitted (or empty) every column of ``df`` is
        drawn instead of selecting nothing and failing inside ``plot``.

    Returns
    -------
    None
    """
    # Clear whatever is on the current pyplot figure before plotting.
    plt.clf()

    # Explicit emptiness test: `if labels` would be ambiguous for array-likes.
    if labels is None or len(labels) == 0:
        selected = df
    else:
        selected = df[labels]
    selected.plot(figsize=(12, 8))
def score_pair(series_x, series_y):
    """Score two series against each other with ``dtw_std``.

    Calls ``dtw_std`` in distance-only mode with ``squared=True`` and hands
    back whatever it returns, so the function can serve as a pairwise metric.
    """
    distance = dtw_std(series_x, series_y, squared=True, dist_only=True)
    return distance
In [3]:
# Load the tab-separated sample file, parsing the first column as dates.
df = pd.read_table(
    "samples/cleaned_dipd.tsv",
    delimiter='\t',
    parse_dates=[0],
)

# Index the rows by sTime while keeping sTime available as a normal column.
df = df.set_index(['sTime'], drop=False)

# Derive day-of-week and hour-of-day from every timestamp.
df['dow'] = df['sTime'].apply(lambda stamp: stamp.dayofweek)
df['hr'] = df['sTime'].apply(lambda stamp: stamp.hour)

# Mean dIP-Distinct for each (dow, hr) pair, with the group keys as columns.
hourly = (
    df[['dow', 'hr', 'dIP-Distinct']]
    .groupby(['dow', 'hr'])
    .agg([np.mean])
    .reset_index()
)
In [4]:
# Spread of the per-hour means of dIP-Distinct, one box per day of week.
hourly.boxplot(
    column=['dIP-Distinct'],
    by='dow',
    figsize=(12, 6),
    showmeans=True,
)
Out[4]:
In [5]:
# Spread of the per-day means of dIP-Distinct, one box per hour of day.
hourly.boxplot(
    column=['dIP-Distinct'],
    by='hr',
    figsize=(24, 9),
    showmeans=True,
)
Out[5]:
In [6]:
# Mean dIP-Distinct for every (day-of-week, hour, source IP) combination.
ip_groups = (
    df[['dow', 'hr', 'sIP', 'dIP-Distinct']]
    .groupby(['dow', 'hr', 'sIP'])
    .mean()
)

# Reshape to one row per (dow, hr) and one column per sIP; combinations with
# no observation are filled with 0.
wide = (
    ip_groups
    .transpose()
    .stack(['dow', 'hr'])
    .fillna(0)
    .reset_index()
    # `axis` is passed by keyword: the positional form `drop(label, 1)` is
    # rejected by pandas 2.0 and later.
    .drop('level_0', axis=1)
    .set_index(['dow', 'hr'])
)
In [7]:
# Every column of `wide` as its own line; legend suppressed.
wide.plot(
    figsize=(18, 9),
    legend=False,
)
Out[7]:
In [8]:
# Flip `wide` so each of its columns becomes one row (one observation).
rowvecs = wide.T

# Pairwise distances between all rows, scored with score_pair.
dist_matrix = pdist(rowvecs, metric=score_pair)

# Average-linkage hierarchical clustering on those distances.
mat_linkage = hierarchy.linkage(dist_matrix, method='average')
In [9]:
# Dendrogram of the linkage, truncated to 12 levels and drawn left-to-right.
dend = hierarchy.dendrogram(
    mat_linkage,
    orientation='right',
    p=12,
    truncate_mode='level',
    # Pass a plain list: some SciPy releases evaluate `labels` in a boolean
    # context, which raises ValueError for a pandas Index or other array-like.
    labels=list(rowvecs.index),
    distance_sort=True,
)

# Make the figure tall enough for the leaf labels to stay readable.
fig = plt.gcf()
fig.set_size_inches(12, 36)
plt.show()
In [10]:
# Line plot of the client_134 and client_1290 columns together.
plot_compare(
    df=wide,
    labels=['client_134', 'client_1290'],
)
In [11]:
# DTW cost matrix and warping path for client_134 against client_1290.
plot_dtw(
    series_x=wide['client_134'],
    series_y=wide['client_1290'],
)
In [12]:
# Line plot of the client_215 and client_978 columns together.
plot_compare(
    df=wide,
    labels=['client_215', 'client_978'],
)
In [13]:
# DTW cost matrix and warping path for client_215 against client_978.
plot_dtw(
    series_x=wide['client_215'],
    series_y=wide['client_978'],
)