Inspired by "Time Maps: Visualizing Discrete Events Across Many Timescales" — https://districtdatalabs.silvrback.com/time-maps-visualizing-discrete-events-across-many-timescales
In [1]:
import os; os.sys.path.append(os.path.dirname(os.path.abspath('.'))) # for relative imports
from utils.nab_data import NABData
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
# Load the NAB (Numenta Anomaly Benchmark) corpus via the project helper and
# peek at the per-file summary table.
# NOTE(review): NABData comes from utils.nab_data; its summary schema isn't
# visible here — confirm in that module.
data = NABData()
data.summary().head()
Out[4]:
In [7]:
# Names of the individual NAB series held by the loader (keys of its
# internal dict of DataFrames).
data.data.keys()
Out[7]:
In [8]:
# Quick visual check of the NYC taxi series using the loader's built-in plot.
data.plot('nyc_taxi')
The example in the blog post builds a heat map of the time intervals between consecutive events in a sequence.
In [11]:
import scipy.ndimage as ndi

df = data['nyc_taxi']
times = df.index

# Inter-event intervals; pairing interval i (x) with interval i+1 (y) gives
# the "time map" coordinates from the blog post.
diffs = np.diff(times)
xcoords = diffs[:-1]  # all differences except the last
ycoords = diffs[1:]   # all differences except the first

Nside = 256  # number of bins along each axis of the 2-D histogram
width = 8    # sigma of the Gaussian blur, in bins

# Scale the intervals into [0, Nside-1] bin indices. The explicit int cast is
# required: NumPy >= 1.12 raises IndexError when arrays are indexed with
# floats (the original relied on implicit truncation).
max_diff = np.max(diffs)  # maximum time difference
x_heat = ((Nside - 1) * xcoords / max_diff).astype(int)
y_heat = ((Nside - 1) * ycoords / max_diff).astype(int)

# Histogram matrix counting points per grid square. np.add.at accumulates
# correctly at repeated indices (plain H[x, y] += 1 on arrays would drop
# duplicates), replacing the original per-point Python loop.
H = np.zeros((Nside, Nside))
np.add.at(H, (x_heat, y_heat), 1)

H = ndi.gaussian_filter(H, width)  # apply Gaussian blur
H = np.transpose(H)                # orient like the scatter plot
plt.imshow(H, origin='lower')      # display H as an image
plt.show()
Our data file is sampled at perfectly regular intervals, so this inter-event-time approach carries no information here (verified below: there is only one distinct interval).
In [13]:
# Count distinct sampling intervals; a result of 1 means the index is
# perfectly regular, so the inter-event "time map" above is uninformative.
len(set(np.diff(data['nyc_taxi'].index)))
Out[13]:
However, perhaps the axes could reflect time of day and day of week to show mean values for those instead.
In [14]:
# Peek at the raw NYC taxi frame (DatetimeIndex plus the value column).
df.head()
Out[14]:
This data file is sampled every 30 minutes...
In [15]:
# Weekday of the first timestamp (Monday=0 ... Sunday=6).
df.index[0].weekday()
Out[15]:
In [24]:
# Confirm all seven weekday codes (0-6) appear in the index.
np.unique(df.index.map(lambda x: x.weekday()))
Out[24]:
In [18]:
# Map weekday number (Monday=0 ... Sunday=6) to a short label for axis ticks.
day_names = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
days_legend = {num: name for num, name in enumerate(day_names)}
days_legend
Out[18]:
In [19]:
# One boolean mask per weekday (Monday=0 ... Sunday=6) over df's DatetimeIndex.
# The map() inside the comprehension is evaluated eagerly for each `day`, so
# there is no late-binding closure issue.
day_masks = {day: df.index.map(lambda x: x.weekday() == day) for day in range(7)}
# Sanity check: number of rows falling on Mondays.
df.loc[day_masks[0]].shape
Out[19]:
In [29]:
# All distinct times-of-day in the index — expect 48 for 30-minute sampling
# (per the note above); np.unique also returns them sorted.
times = np.unique(df.index.map(lambda x: x.time()))
In [30]:
# One boolean mask per distinct time-of-day over df's index.
tindex = df.index.map(lambda x: x.time())
time_masks = dict(zip(times, [(tindex == t) for t in times]))

# Spot-check one mask. Using list(...) instead of .keys()[0], and print(...)
# instead of the print statement, keeps this cell working on both Python 2
# and Python 3 (dict views are not indexable in Python 3).
k = list(time_masks)[0]
print(k)
print(df.loc[time_masks[k]].shape)
In [61]:
# Build a (time-of-day x weekday) grid holding the mean taxi count for each
# cell; rows are the 48 times of day, columns the 7 weekdays.
dtmap = pd.DataFrame(np.zeros((len(time_masks), len(day_masks))),
                     index=sorted(times), columns=range(7))
print(dtmap.shape)

fn = np.mean  # aggregation applied to each (weekday, time) bucket
# .items() replaces the Python-2-only .iteritems(); print(...) replaces the
# print statement — both forms also run unchanged on Python 2.
for day, daymask in day_masks.items():
    for time, timemask in time_masks.items():
        # Rows matching both this weekday and this time of day.
        val = fn(df.loc[daymask & timemask])
        dtmap.loc[time, day] = val.values
dtmap.head()
Out[61]:
In [63]:
# Render the day/time grid as a heat map: y axis is time of day (every 4th
# tick labelled to avoid clutter), x axis is weekday.
plt.figure(figsize=(5, 15))
plt.imshow(dtmap, origin='lower')
ytick_positions = range(dtmap.shape[0])[::4]
ytick_labels = sorted(times)[::4]
plt.yticks(ytick_positions, ytick_labels)
plt.xticks(range(7), [days_legend[d] for d in range(7)], rotation=60)
plt.colorbar()
plt.show()
There — we now have an informative at-a-glance view of the data: activity peaks in the late evenings, and the weekend pattern is shifted a few hours later than on weekdays. And so on.