Optionally change the spatial and temporal window; a sketch of restricting the temporal window follows the first cell below.
In [1]:
%matplotlib inline
from common import *
#datadir = os.path.join("//media", "disk", "Data")
datadir = os.path.join("..", "..", "..", "..", "..", "Data")
south_side, points = load_data(datadir)
grid = grid_for_south_side()
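To restrict the temporal window, filter the events by timestamp before handing them to the trainer (the event data supports boolean-mask indexing, as used later in this notebook); the spatial window is set via the trainer's region below. A minimal sketch, with an illustrative cut-off date:
In [ ]:
# Keep only events from 2017 onwards (illustrative cut-off)
points_windowed = points[points.timestamps >= np.datetime64("2017-01-01")]
points_windowed.time_range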
In [2]:
import open_cp.stscan as stscan
import open_cp.stscan2 as stscan2
In [3]:
trainer = stscan.STSTrainer()
trainer.region = grid.region()
trainer.data = points
In [4]:
scanner, _ = trainer.to_scanner()
scanner.coords.shape, scanner.timestamps.shape
Out[4]:
In [5]:
# Check how we convert the data: the scanner stores timestamps as
# milliseconds before the most recent event, in increasing order
last_time = max(trainer.data.timestamps)
x = (last_time - trainer.data.timestamps) / np.timedelta64(1,"ms")
indexes = np.argsort(x)
np.testing.assert_allclose(x[indexes], scanner.timestamps)
np.testing.assert_allclose(trainer.data.coords[:,indexes], scanner.coords)
In [6]:
# Timestamps are in milliseconds; convert to minutes for the SaTScan export
ts = scanner.timestamps / 1000 / 60
ts = ts[:100]
c = scanner.coords[:,:100]
stscan2.AbstractSTScan.write_to_satscan("temp", max(ts), c, ts)
max(max(ts) - ts)
Out[6]:
In [7]:
scanner.timestamps = scanner.timestamps[:100]
scanner.coords = scanner.coords[:,:100]
list(scanner.find_all_clusters())
Out[7]:
In [23]:
trainer1 = trainer.bin_timestamps(np.datetime64("2017-01-01"), np.timedelta64(1, "D"))
trainer1.data.number_data_points, trainer1.data.time_range
Out[23]:
In [24]:
trainer1.to_satscan("test")
In [25]:
result = trainer1.predict()
result.statistics[:5]
Out[25]:
In [15]:
trainer1 = trainer.grid_coords(grid.region(), grid.xsize)
trainer1 = trainer1.bin_timestamps(np.datetime64("2017-01-01"), np.timedelta64(1, "D"))
#trainer1.data = trainer1.data[trainer1.data.timestamps < np.datetime64("2011-04-01")]
trainer1.data.number_data_points, trainer1.data.time_range
Out[15]:
In [16]:
trainer1.to_satscan("test")
In [17]:
result = trainer1.predict()
In [20]:
result.statistics[:5]
Out[20]:
In [19]:
result = trainer.predict()
In [20]:
result.clusters[0]
Out[20]:
In [35]:
pred = result.grid_prediction(grid.xsize)
pred.mask_with(grid)
In [42]:
import matplotlib.patches

def plot_clusters(ax, coords, clusters):
    # Axis limits computed from the full scanner data, padded by 5%
    xmax, xmin = np.max(scanner.coords[0]), np.min(scanner.coords[0])
    xd = (xmax - xmin) / 100 * 5
    ax.set(xlim=[xmin - xd, xmax + xd])
    ymax, ymin = np.max(scanner.coords[1]), np.min(scanner.coords[1])
    yd = (ymax - ymin) / 100 * 5
    ax.set(ylim=[ymin - yd, ymax + yd])
    ax.set_aspect(1)
    # Each cluster is drawn as a translucent disc
    for c in clusters:
        cir = matplotlib.patches.Circle(c.centre, c.radius, alpha=0.5)
        ax.add_patch(cir)
    ax.scatter(*coords, color="black", marker="+", linewidth=1)

def plot_grid_pred(ax, pred):
    cmap = ax.pcolormesh(*pred.mesh_data(), pred.intensity_matrix, cmap=yellow_to_red)
    fig.colorbar(cmap, ax=ax)
    ax.set_aspect(1)
    ax.add_patch(descartes.PolygonPatch(south_side, fc="none", ec="Black"))

fig, axes = plt.subplots(ncols=2, figsize=(17,8))
plot_clusters(axes[0], trainer.data.coords, result.clusters)
plot_grid_pred(axes[1], pred)
In [28]:
trainer1 = trainer.grid_coords(grid.region(), grid.xsize)
trainer1 = trainer1.bin_timestamps(np.datetime64("2017-01-01"), np.timedelta64(1, "D"))
trainer1.region = grid.region()
In [30]:
result1 = trainer1.predict()
In [31]:
result1.clusters[:10]
Out[31]:
In [32]:
pred1 = result1.grid_prediction(grid.xsize)
pred1.mask_with(grid)
In [43]:
fig, axes = plt.subplots(ncols=2, figsize=(17,8))
plot_clusters(axes[0], trainer1.data.coords, result1.clusters)
plot_grid_pred(axes[1], pred1)
As you can see above, some of the clusters returned have zero radius. This makes sense: having assigned every event to the centre of the grid cell it falls into, some clusters consist of just the events in a single grid cell.
Instead, the library can replace each cluster with one having the same centre but the radius enlarged as far as possible without capturing any further events; a sketch of this enlargement follows.
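As a minimal sketch of that enlargement (maximal_radius is a hypothetical helper written for illustration, not part of open_cp; the library's own version is used via use_maximal_clusters below):
In [ ]:
# Enlarge a disc up to, but not beyond, the nearest event outside it,
# so the grown disc contains no additional events
def maximal_radius(centre, radius, coords):
    # coords has shape (2, N); distances from the cluster centre to all events
    distances = np.sqrt(np.sum((coords - np.asarray(centre)[:,None])**2, axis=0))
    outside = distances[distances > radius]
    if len(outside) == 0:
        return radius
    # Any value below min(outside) works; take the midpoint
    return (radius + np.min(outside)) / 2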
In [37]:
pred2 = result1.grid_prediction(grid.xsize, use_maximal_clusters=True)
pred2.mask_with(grid)
In [44]:
fig, axes = plt.subplots(ncols=2, figsize=(17,8))
plot_clusters(axes[0], trainer1.data.coords, result1.max_clusters)
plot_grid_pred(axes[1], pred2)
In [ ]:
time_masks, time_counts, times = scanner.make_time_ranges()
N = scanner.timestamps.shape[0]
centre = scanner.coords.T[0]
space_masks, space_counts, dists = scanner.find_discs(centre)
actual = scanner._calc_actual(space_masks, time_masks, time_counts)
expected = space_counts[:,None] * time_counts[None,:] / N
_mask = (actual > 1) & (actual > expected)
stats = scanner._ma_statistic(np.ma.array(actual, mask=~_mask),
    np.ma.array(expected, mask=~_mask), N)
_mask1 = np.any(_mask, axis=1)
if not np.any(_mask1):
    raise Exception()
m = np.ma.argmax(stats, axis=1)[_mask1]
stats1 = stats[_mask1,:]
stats2 = stats1[range(stats1.shape[0]),m].data
used_dists = dists[_mask1]
used_times = times[m]
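Reading off the cell above: with N events in total, a space-time cylinder (a disc paired with a time window) contains n = actual events, while expected = s*t/N are expected under the null (s events in the disc, t in the time window). The quantity maximised is the scan log-likelihood ratio n*log(n/expected) + (N-n)*log((N-n)/(N-expected)), evaluated only where n > 1 and n > expected. The cells below time the individual steps and check an equivalent lookup-table formulation.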
In [ ]:
%timeit(scanner.find_discs(centre))
In [ ]:
%timeit(np.sum(space_masks[:,:,None] & time_masks[:,None,:], axis=0))
In [ ]:
%timeit(scanner._calc_actual(space_masks, time_masks, time_counts))
In [ ]:
np.testing.assert_allclose(scanner._calc_actual(space_masks, time_masks, time_counts),
    np.sum(space_masks[:,:,None] & time_masks[:,None,:], axis=0))
In [ ]:
%timeit(space_counts[:,None] * time_counts[None,:] / N)
In [ ]:
%timeit((actual > 1) & (actual > expected))
In [ ]:
%timeit(scanner._ma_statistic(np.ma.array(actual, mask=~_mask), np.ma.array(expected, mask=~_mask), N))
In [ ]:
# Lookup tables of logarithms: log_lookup[k] == log(k) for k >= 1, and log_lookup[0] == 0
log_lookup = np.log(np.array([1] + list(range(1,N+1))))
log_lookup2 = np.log(np.array([1] + list(range(1,N*N+1))))
In [ ]:
# Check the lookup-table formulation agrees with taking logarithms directly
sh = (space_counts.shape[0], time_counts.shape[0])
s = np.ma.array(np.broadcast_to(space_counts[:,None], sh), mask=~_mask)
t = np.ma.array(np.broadcast_to(time_counts[None,:], sh), mask=~_mask)
a = np.ma.array(actual, mask=~_mask)
e = np.ma.array(s*t, mask=~_mask) / N
x1 = a * np.ma.log(a/e)
Nl = np.log(N)
aa = a.astype(int)
y1 = a * (Nl + log_lookup[aa] - log_lookup[s] - log_lookup[t])
assert np.ma.max(np.ma.abs(x1-y1)) < 1e-10
x2 = (N-a) * (np.ma.log(N-a) - np.ma.log(N-e))
y2 = (N-a) * (Nl + log_lookup[N-aa] - np.ma.log(N*N-s*t))
assert np.ma.max(np.ma.abs(x2-y2)) < 1e-10
In [ ]:
aa = actual.astype(int)

def f():
    # Statistic via the pre-computed log tables
    sl = log_lookup[space_counts]
    tl = log_lookup[time_counts]
    st = N*N - space_counts[:,None] * time_counts[None,:]
    Nl = np.log(N)
    y = aa * (Nl + log_lookup[aa] - sl[:,None] - tl[None,:])
    yy = (N-aa) * (Nl + log_lookup[N-aa] - log_lookup2[st])
    return np.ma.array(y + yy, mask=~_mask)

stats = scanner._ma_statistic(np.ma.array(actual, mask=~_mask),
    np.ma.array(expected, mask=~_mask), N)
np.ma.max(np.ma.abs(stats - f()))
In [ ]:
%timeit(f())
In [ ]:
%timeit(np.any(_mask, axis=1))
In [ ]:
%timeit(np.ma.argmax(stats, axis=1)[_mask1])
In [ ]:
%timeit(stats[_mask1,:])
In [ ]:
%timeit(stats1[range(stats1.shape[0]),m].data)
In [ ]:
%timeit(dists[_mask1])
In [ ]:
%timeit(times[m])
In [ ]:
def f():
    x = scanner.faster_score_all()
    return next(x)

def f1():
    x = scanner.faster_score_all_new()
    return next(x)

a = f()
a1 = f1()
for i in range(4):
    np.testing.assert_allclose(a[i], a1[i])
In [ ]:
# Compare against the old, slow method
def find_chunk(ar, start_index):
    # Find the end of the run of entries equal to ar[start_index]
    x = (ar == ar[start_index])
    end_index = start_index
    while end_index < len(ar) and x[end_index]:
        end_index += 1
    return end_index

x = scanner.faster_score_all_old()
a2 = next(x)
start_index = 0
for index in range(len(a1[1])):
    end_index = find_chunk(a2[1], start_index)
    i = np.argmax(a2[3][start_index:end_index])
    for j in range(1,4):
        assert abs(a2[j][start_index+i] - a1[j][index]) < 1e-10
    start_index = end_index
In [ ]:
%timeit(f())
In [ ]:
%timeit(f1())
In [ ]:
import datetime
x = scanner.faster_score_all()
for _ in range(20):
    now = datetime.datetime.now()
    next(x)
    print(datetime.datetime.now() - now)
In [ ]:
next(scanner.find_all_clusters())