Generate example dataset

Using our favourite source, the Chicago crime dataset: https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2

Geometry from https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Community-Areas-current-/cauq-8yn6



In [1]:

    
import os, csv, lzma
import numpy as np
import open_cp.sources.chicago
import geopandas as gpd
import pyproj
import shapely.geometry

Get our favourite region, the South Side



In [2]:

    
# Point open_cp at the directory holding the Chicago data, then fetch the
# "South" side geometry.
# Alternative location (disabled): os.path.join("/media", "OTHERDATA")
datadir = os.path.join(*([".."] * 4), "Data")
open_cp.sources.chicago.set_data_directory(datadir)
polygon = open_cp.sources.chicago.get_side("South")



In [3]:

    
# One-row GeoDataFrame holding the South Side geometry, tagged as EPSG:2790.
frame = gpd.GeoDataFrame(
    {"name": ["South Side"]},
    geometry=[polygon],
    crs={"init": "epsg:2790"},
)
# Bare expression so the notebook renders the frame.
frame









    Out[3]:






  
    
      
      name
      geometry
    
  
  
    
      0
      South Side
      POLYGON ((365647.3845872784 565208.9811670227,...



In [4]:

    
# Write the South Side frame to disk under the relative path "SouthSide".
# NOTE(review): no driver is given, so geopandas' default output format is
# used -- presumably an ESRI Shapefile; confirm for the installed version.
frame.to_file("SouthSide")

Process the data



In [6]:

    
def gen():
    """Yield the rows of the xz-compressed Chicago CSV, one list of strings per row.

    The first row yielded is the header. The file is opened lazily when
    iteration starts and is closed when the generator is exhausted or closed.
    """
    filename = os.path.join(datadir, "chicago_all_dec2017.csv.xz")
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires; this matches how the output file is opened below.
    with lzma.open(filename, "rt", newline="") as f:
        yield from csv.reader(f)

# Peek at the header and the first data row to check the column layout.
rows = gen()
print(next(rows))
print(next(rows))









    



['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
['4647369', 'HM155213', '01/31/2006 12:13:05 PM', '066XX N BOSWORTH AVE', '1811', 'NARCOTICS', 'POSS: CANNABIS 30GMS OR LESS', 'SCHOOL, PUBLIC, BUILDING', 'true', 'false', '2432', '024', '40', '1', '18', '1164737', '1944193', '2006', '04/15/2016 08:55:02 AM', '42.002478396', '-87.66929687', '(42.002478396, -87.66929687)']



In [ ]:

    
# Collect every 2016 row whose projected location lies inside the South Side
# polygon. Column indices follow the header row printed above:
# 2 = Date, 19 = Latitude, 20 = Longitude.
proj = pyproj.Proj({"init":"epsg:2790"})
rows = gen()
header = next(rows)  # consume the header so only data rows are iterated
choices = []
for row in rows:
    # Skip rows with no coordinates. Compare by value: `is ""` tests object
    # identity, which only works by interpreter accident and raises a
    # SyntaxWarning on Python 3.8+. Longitude is checked too so float("")
    # below can never raise.
    if row[19] == "" or row[20] == "":
        continue
    # Dates look like "01/31/2006 12:13:05 PM"; characters 6..9 hold the year.
    if row[2][6:10] != "2016":
        continue
    # The projection is called with (longitude, latitude).
    x, y = proj(float(row[20]), float(row[19]))
    pt = shapely.geometry.Point(x, y)
    if polygon.intersects(pt):
        choices.append(row)



In [ ]:

    
# Build the example table: a header line followed by a random sample of up to
# 1000 of the filtered rows, kept in their original order.
# NOTE: the draw is unseeded, so every run selects a different sample.

# Columns kept, per the header row printed above:
# Case Number, Date, Block, Primary Type, Latitude, Longitude.
columns = (1, 2, 3, 5, 19, 20)

# Cap the sample size so that fewer than 1000 filtered rows does not make
# np.random.choice(..., replace=False) raise.
sample_size = min(1000, len(choices))
want = np.sort(np.random.choice(len(choices), sample_size, replace=False))

# Reuse the header captured while filtering instead of re-opening the
# compressed file (which left a half-consumed generator holding it open).
out = [[header[c] for c in columns]]
# Index the sorted positions directly rather than testing `i in want` for
# every row, which is a linear scan of the array each time.
out.extend([choices[i][c] for c in columns] for i in want)



In [ ]:

    
# Dump the header and the sampled rows to example.csv in the working directory.
with open("example.csv", "w", newline="") as sink:
    writer = csv.writer(sink)
    for record in out:
        writer.writerow(record)



In [ ]: