In [1]:
# Allow us to load `open_cp` without installing
import sys, os.path
sys.path.insert(0, os.path.abspath(".."))
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import open_cp
CSV file available from https://catalog.data.gov/dataset/crimes-one-year-prior-to-present-e171f
In [2]:
import open_cp.sources.chicago as chicago
points = chicago.default_burglary_data()
points
Out[2]:
In [3]:
len(points.timestamps), points.time_range
Out[3]:
In [4]:
bbox = points.bounding_box
print("X coord range:", bbox.xmin, bbox.xmax)
print("Y coord range:", bbox.ymin, bbox.ymax)
print(bbox.aspect_ratio)
In [5]:
_, ax = plt.subplots(figsize=(10,10 * bbox.aspect_ratio))
ax.scatter(points.coords[0], points.coords[1], alpha=0.1, marker="o", s=1)
Out[5]:
As an American city, most streets run North-South or East-West. Further, the data is geocoded to the centre of the "block", to anonymise the data. (Though this is slightly inconsistent, if one looks closely at the raw CSV file.)
In the plot above:
In [6]:
mask = ( (points.xcoords >= 355000) & (points.xcoords <= 365000) &
(points.ycoords >= 575000) & (points.ycoords <= 585000) )
downtown = points[mask]
bbox = downtown.bounding_box
print("X coord range:", bbox.xmin, bbox.xmax)
print("Y coord range:", bbox.ymin, bbox.ymax)
In [7]:
_, ax = plt.subplots(figsize=(5, 5 * bbox.aspect_ratio))
ax.scatter(downtown.coords[0], downtown.coords[1], alpha=0.1, marker="o", s=1)
Out[7]:
In [8]:
import open_cp.sources.ukpolice as ukpolice
points = ukpolice.default_burglary_data()
len(points.timestamps)
Out[8]:
In [9]:
bbox = points.bounding_box
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
ax.scatter(points.xcoords, points.ycoords, s=10, alpha=0.2)
Out[9]:
These are longitude / latitude points, which distort distance. Assuming you have pyproj
installed, you can project. For the UK, we use British National Grid
In [10]:
import open_cp
In [11]:
projected_points = open_cp.data.points_from_lon_lat(points, epsg=7405)
In [12]:
bbox = projected_points.bounding_box
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
ax.scatter(projected_points.xcoords, projected_points.ycoords, s=10, alpha=0.2)
Out[12]:
In [13]:
import open_cp.sources.random as random
import datetime
In [14]:
region = open_cp.RectangularRegion(390000, 450000, 410000, 450000)
points = random.random_uniform(region, datetime.date(2017,1,1), datetime.date(2017,3,1), 1000)
points.time_range
Out[14]:
In [15]:
bbox = points.bounding_box
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
ax.scatter(*points.coords, s=10, alpha=0.2)
Out[15]:
If we have scipy installed, we can quickly use a 2D Gaussian kernel density estimation to get an estimate of the "risk intensity" from the real West Yorkshire data.
In [16]:
import scipy.stats
In [17]:
kernel = scipy.stats.gaussian_kde(projected_points.coords)
In [18]:
X, Y = np.mgrid[bbox.xmin:bbox.xmax:100j, bbox.ymin:bbox.ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
Z = np.reshape(kernel(positions), X.shape)
np.max(Z)
Out[18]:
In [19]:
plt.imshow(np.rot90(Z))
Out[19]:
In [20]:
sampler = random.KernelSampler(region, kernel, 4e-9)
points = random.random_spatial(sampler, datetime.date(2017,1,1), datetime.date(2017,3,1), 2350)
In [21]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 6))
ax[0].scatter(*projected_points.coords, s=10, alpha=0.2)
ax[1].scatter(*points.coords, s=10, alpha=0.2)
for i in [0, 1]:
ax[i].set_aspect(bbox.aspect_ratio)
ax[i].set(xlim=[bbox.xmin, bbox.xmax], ylim=[bbox.ymin, bbox.ymax])
ax[0].set_title("Real data, Jan 2017")
_ = ax[1].set_title("Gaussian KDE sample")
The real plot still looks somewhat different to the random test data, suggesting that a simple fixed bandwidth Gaussian KDE is not appropriate (which we already knew...)
In [22]:
import open_cp.kernels
In [23]:
kernel = open_cp.kernels.kth_nearest_neighbour_gaussian_kde(projected_points.coords, k=10)
sampler = random.KernelSampler(region, kernel, 4e-9)
points10 = random.random_spatial(sampler, datetime.date(2017,1,1), datetime.date(2017,3,1), 2350)
kernel = open_cp.kernels.kth_nearest_neighbour_gaussian_kde(projected_points.coords, k=25)
sampler = random.KernelSampler(region, kernel, 4e-9)
points25 = random.random_spatial(sampler, datetime.date(2017,1,1), datetime.date(2017,3,1), 2350)
kernel = open_cp.kernels.kth_nearest_neighbour_gaussian_kde(projected_points.coords, k=50)
sampler = random.KernelSampler(region, kernel, 4e-9)
points50 = random.random_spatial(sampler, datetime.date(2017,1,1), datetime.date(2017,3,1), 2350)
In [24]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 9))
ax[0,0].scatter(*projected_points.coords, s=10, alpha=0.2)
ax[0,1].scatter(*points10.coords, s=10, alpha=0.2)
ax[1,0].scatter(*points25.coords, s=10, alpha=0.2)
ax[1,1].scatter(*points50.coords, s=10, alpha=0.2)
for a in ax.ravel():
a.set_aspect(bbox.aspect_ratio)
a.set(xlim=[bbox.xmin, bbox.xmax], ylim=[bbox.ymin, bbox.ymax])
ax[0,0].set_title("Real data, Jan 2017")
ax[0,1].set_title("k=10 nearest neighbour sample")
ax[1,0].set_title("k=25 nearest neighbour sample")
ax[1,1].set_title("k=50 nearest neighbour sample")
fig.tight_layout()
None
Visually, having a rather narrow bandwidth seems to look better.
I suspect that to produce more realistic simulations, to geography of the data needs to be investigated: i.e. locate the points onto buildings and into the real street network.
In [25]:
import open_cp.sources.sepp as sepp
In [26]:
region = open_cp.RectangularRegion(0,100,0,100)
kernel = sepp.PoissonTimeGaussianSpace(1, [50, 50], [150, 25], 0.8)
sampler = sepp.InhomogeneousPoisson(region, kernel)
In [27]:
points = sampler.sample(0, 100)
In [28]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 6))
ax[0].scatter(points[1], points[2])
ax[0].set_title("Space location")
ax[0].set_aspect(1)
ax[0].set_xlim(0,100)
ax[0].set_ylim(0,100)
ax[1].scatter(points[0], points[1])
ax[1].set_xlabel("time")
ax[1].set_ylabel("x coord")
ax[1].set_title("X location against time")
None
The coordinates in space give samples from a 2D correlated Gaussian distribution, as we expect.
If we do this repeatedly, then the time coordinates along should give a poisson process.
In [29]:
counts = []
window = []
for _ in range(10000):
times = sampler.sample(0,100)[0]
counts.append(len(times))
window.append(np.sum(times <= 20))
fig, ax = plt.subplots(ncols=2, figsize=(16, 4))
ax[0].hist(counts)
ax[0].set_title("Number of points")
ax[1].hist(window)
ax[1].set_title("In window [0,20]")
None
If the intensity function of the poisson process has the form $\lambda(t,x,y) = \nu(t)\mu(x,y)$ then we can simulate the time-only Poission process with density $\nu$, and then sample the space dimension as if it were a "mark" (see the notion of a "marked Poisson process" in the literature). If $\mu$ is a probability density of a standard type, this is much faster, because we can very easily draw samples for the space dimensions.
In [30]:
time_kernel = sepp.Exponential(exp_rate=1, total_rate=10)
space_sampler = sepp.GaussianSpaceSampler([50, 50], [150, 25], 0.8)
sampler = sepp.InhomogeneousPoissonFactors(time_kernel, space_sampler)
In [31]:
points = sampler.sample(0, 100)
fig, ax = plt.subplots(ncols=2, figsize=(16, 6))
ax[0].scatter(points[1], points[2])
ax[0].set_title("Space location")
ax[0].set_aspect(1)
ax[0].set_xlim(0,100)
ax[0].set_ylim(0,100)
ax[1].scatter(points[0], points[1])
ax[1].set_xlabel("time")
ax[1].set_ylabel("x coord")
ax[1].set_title("X location against time")
None
You need to pass two intensity functions (aka kernels), one for the background events, and one for the triggered events.
In the following example, the background sampler has as time component a constant rate poisson process, and a Gaussian space density, centred at (50,50).
The trigger kernel has an exponential density in time (so on average each event triggers one further event) and a space kernel which is deliberate biases to jump around 5 units in the x direction. We can hence visualise the cascade of triggered events as a rightward drift on the first graph, and an upward drift on the second graph.
In [32]:
background_sampler = sepp.InhomogeneousPoissonFactors(sepp.HomogeneousPoisson(1),
sepp.GaussianSpaceSampler([50,50], [50,50], 0))
time_kernel = sepp.Exponential(exp_rate=1, total_rate=1)
space_sampler = sepp.GaussianSpaceSampler([5, 0], [1, 1], 0)
trigger_sampler = sepp.InhomogeneousPoissonFactors(time_kernel, space_sampler)
sampler = sepp.SelfExcitingPointProcess(background_sampler, trigger_sampler)
In [33]:
points = sampler.sample(0,10)
In [34]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 6))
ax[0].scatter(points[1], points[2])
ax[0].set_title("Space location")
ax[0].set_aspect(1)
ax[0].set_xlim(0,100)
ax[0].set_ylim(0,100)
ax[1].scatter(points[0], points[1])
ax[1].set_xlabel("time")
ax[1].set_ylabel("x coord")
ax[1].set_title("X location against time")
None
In [ ]: