In [1]:
import os, sys
sys.path.append(os.path.dirname(os.path.abspath('.'))) # add the parent directory for relative imports
from utils.nab_data import NABData
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
data = NABData()
data.summary().head()


Out[4]:
                            file  length  features period  periods_vary  min    max   mean    std  25th_percentile  50th_percentile  75th_percentile
0          art_daily_small_noise    4032         1     5m             1   18  87.98  42.44  28.08            19.84            21.61            75.45
1  art_daily_perfect_square_wave    4032         1     5m             1   20  80.00  42.50  29.05            20.00            20.00            80.00
2             art_daily_no_noise    4032         1     5m             1   20  80.00  42.50  27.95            20.00            20.29            79.62
3                      art_noisy    4032         1     5m             1    8  19.00  13.49   3.15            10.79            13.45            16.23
4                   art_flatline    4032         1     5m             1   45  45.00  45.00   0.00            45.00            45.00            45.00
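
For a rough idea of where these numbers come from, a summary row like the ones above could be computed for a single series with plain pandas (a sketch only; NABData's actual implementation may differ, and the summarize helper below is hypothetical):

def summarize(name, df):
    # hypothetical sketch of one summary row; assumes a single 'value'
    # column and a DatetimeIndex, as in the NAB files loaded above
    diffs = np.diff(df.index)
    stats = df['value'].describe()
    return {'file': name,
            'length': len(df),
            'features': df.shape[1],
            'period': pd.Timedelta(diffs[0]),
            'periods_vary': len(set(diffs)) > 1,
            'min': stats['min'], 'max': stats['max'],
            'mean': stats['mean'], 'std': stats['std'],
            '25th_percentile': stats['25%'],
            '50th_percentile': stats['50%'],
            '75th_percentile': stats['75%']}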

In [7]:
data.data.keys()


Out[7]:
['art_daily_small_noise',
 'art_daily_perfect_square_wave',
 'art_daily_no_noise',
 'art_noisy',
 'art_flatline',
 'TravelTime_387',
 'occupancy_t4013',
 'speed_7578',
 'TravelTime_451',
 'speed_t4013',
 'occupancy_6005',
 'speed_6005',
 'Twitter_volume_UPS',
 'Twitter_volume_AAPL',
 'Twitter_volume_IBM',
 'Twitter_volume_CVS',
 'Twitter_volume_PFE',
 'Twitter_volume_GOOG',
 'Twitter_volume_FB',
 'Twitter_volume_CRM',
 'Twitter_volume_KO',
 'Twitter_volume_AMZN',
 'nyc_taxi',
 'cpu_utilization_asg_misconfiguration',
 'machine_temperature_system_failure',
 'rogue_agent_key_updown',
 'ambient_temperature_system_failure',
 'ec2_request_latency_system_failure',
 'rogue_agent_key_hold',
 'art_load_balancer_spikes',
 'art_daily_flatmiddle',
 'art_increase_spike_density',
 'art_daily_nojump',
 'art_daily_jumpsdown',
 'art_daily_jumpsup',
 'exchange-2_cpc_results',
 'exchange-3_cpc_results',
 'exchange-4_cpc_results',
 'exchange-4_cpm_results',
 'exchange-3_cpm_results',
 'exchange-2_cpm_results',
 'ec2_disk_write_bytes_c0d644',
 'ec2_network_in_5abac7',
 'ec2_cpu_utilization_fe7f93',
 'elb_request_count_8c0756',
 'ec2_cpu_utilization_77c1ca',
 'grok_asg_anomaly',
 'ec2_cpu_utilization_24ae8d',
 'ec2_network_in_257a54',
 'rds_cpu_utilization_cc0c53',
 'ec2_cpu_utilization_825cc2',
 'ec2_cpu_utilization_5f5533',
 'rds_cpu_utilization_e47b3b',
 'iio_us-east-1_i-a2eb1cd9_NetworkIn',
 'ec2_cpu_utilization_53ea38',
 'ec2_cpu_utilization_ac20cd',
 'ec2_cpu_utilization_c6585a',
 'ec2_disk_write_bytes_1ef3de']

In [8]:
data.plot('nyc_taxi')


The example in the blog post builds a heat map of the amount of time between successive events in a sequence: each consecutive pair of inter-event gaps becomes an (x, y) point, and the density of those points is rendered as an image.


In [11]:
import scipy.ndimage as ndi

df = data['nyc_taxi']

times = df.index

# calculate time differences:
diffs = np.diff(times)

xcoords = diffs[:-1] # all differences except the last
ycoords = diffs[1:] # all differences except the first

Nside = 256 # the number of bins along x and y for the histogram
width = 8 # the width of the Gaussian function along x and y when applying the blur operation

H = np.zeros((Nside,Nside)) # a 'histogram' matrix that counts the number of points in each grid-square

max_diff = np.max(diffs) # maximum time difference

x_heat = (Nside-1)*xcoords/max_diff # the xy coordinates scaled to the size of the matrix
y_heat = (Nside-1)*ycoords/max_diff # subtract 1 since Python starts counting at 0, unlike Fortran and R

for i in range(len(xcoords)): # loop over all points to count the population of each bin
    H[int(x_heat[i]), int(y_heat[i])] += 1 # truncate to integer bin indices explicitly
    # (relying on implicit float-to-int indexing is deprecated in NumPy)

H = ndi.gaussian_filter(H,width) # apply Gaussian blur
H = np.transpose(H) # so that the orientation is the same as the scatter plot

plt.imshow(H, origin='lower') # display H as an image
plt.show()
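
As an aside, the binning loop above can be vectorized with np.histogram2d; the bin edges differ slightly from the manual scaling, but the technique is identical (a sketch reusing the variables above):

# vectorized equivalent of the binning loop (sketch):
xs = xcoords.astype('timedelta64[s]').astype(float) # gaps as float seconds
ys = ycoords.astype('timedelta64[s]').astype(float)
H2, xedges, yedges = np.histogram2d(xs, ys, bins=Nside)
H2 = ndi.gaussian_filter(H2.T, width) # same blur and transpose as above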



Our data file is sampled at equal intervals, so this approach is worthless here: every gap is identical.


In [13]:
len(set(np.diff(data['nyc_taxi'].index)))


Out[13]:
1

However, the axes could instead reflect time of day and day of week, with each cell showing the mean value for that slot.


In [14]:
df.head()


Out[14]:
                     value
timestamp
2014-07-01 00:00:00  10844
2014-07-01 00:30:00   8127
2014-07-01 01:00:00   6210
2014-07-01 01:30:00   4656
2014-07-01 02:00:00   3820

This data file is sampled every 30 minutes...
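
As a quick confirmation (a one-liner; df as loaded above), the single spacing can be printed directly:

print(pd.Timedelta(np.diff(df.index)[0])) # should show 0 days 00:30:00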


In [15]:
df.index[0].weekday()


Out[15]:
1
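
(2014-07-01 fell on a Tuesday; pandas numbers weekdays from Monday = 0, so Tuesday is 1.)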

In [24]:
np.unique(df.index.map(lambda x: x.weekday()))


Out[24]:
array([0, 1, 2, 3, 4, 5, 6])

In [18]:
days_legend = dict(zip(range(7), ['Mon','Tue','Wed','Thur','Fri','Sat','Sun']))
days_legend


Out[18]:
{0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thur', 4: 'Fri', 5: 'Sat', 6: 'Sun'}

In [19]:
day_masks = dict(zip(range(7), [df.index.map(lambda x: x.weekday() == d) for d in range(7)]))
df.loc[day_masks[0]].shape


Out[19]:
(1440, 1)
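
(Sanity check: 1440 = 48 half-hour samples per day x 30, so the series contains 30 complete Mondays.)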

In [29]:
times = np.unique(df.index.map(lambda x: x.time()))

In [30]:
tindex = df.index.map(lambda x: x.time())
time_masks = dict(zip(times, [(tindex == t) for t in times]))
k = list(time_masks.keys())[0] # grab an arbitrary time key
print(k)
print(df.loc[time_masks[k]].shape)


03:30:00
(215, 1)
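
(215 is the number of days spanned by the series: one 03:30 sample per day.)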

In [61]:
dtmap = pd.DataFrame(np.zeros((len(time_masks), len(day_masks))), index=sorted(times), columns=range(7))
print dtmap.shape
fn = np.mean

for day, daymask in day_masks.items():
    for time, timemask in time_masks.items():
        dtmap.loc[time, day] = fn(df.loc[daymask & timemask, 'value'])

dtmap.head()


(48, 7)
Out[61]:
                     0             1             2             3             4             5             6
00:00:00   9036.700000   9636.129032  12001.451613  14041.774194  17392.387097  23733.612903  24564.133333
00:30:00   7003.866667   7331.290323   9087.870968  11355.225806  14489.709677  22252.516129  23233.333333
01:00:00   5358.866667   5740.354839   7118.903226   9082.193548  11836.870968  20739.806452  22554.300000
01:30:00   4140.633333   4316.032258   5426.354839   7182.935484   9350.290323  18836.677419  20632.766667
02:00:00   3486.000000   3440.032258   4418.000000   5941.387097   7673.354839  17406.677419  18635.933333
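
For what it's worth, the same matrix can be produced without the explicit mask loops using a groupby (a sketch; assumes a reasonably recent pandas):

# vectorized equivalent of the double loop above (sketch):
pivot = df.groupby([df.index.time, df.index.weekday])['value'].mean().unstack()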

In [63]:
fig = plt.figure(figsize=(5, 15))
plt.imshow(dtmap, origin='lower') # display dtmap as an image
plt.yticks(range(dtmap.shape[0])[::4], sorted(times)[::4])
plt.xticks(range(7), [days_legend[x] for x in range(7)], rotation=60)
plt.colorbar()
plt.show()


There we have it: a highly informative quick glance at the data. The most activity occurs in the late evenings, weekend activity is pushed back a few hours relative to weekdays, and so on.