In [1]:
# Allow us to load `open_cp` without installing
import sys, os.path
sys.path.insert(0, os.path.abspath(os.path.join("..", "..")))
The data can be downloaded from https://catalog.data.gov/dataset/crimes-2001-to-present-398a4 (see the module docstring of open_cp.sources.chicago). See also https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2
The complete data sets (all crime events from 2001 onwards) downloaded from these two sources are not identical files. We check that they nevertheless contain the same data.
In [2]:
import sys, os, csv, lzma
import open_cp.sources.chicago as chicago
# Every data file lives in the `open_cp/sources` directory, reached via `../..`
# relative to the working directory.
_sources_dir = os.path.join("..", "..", "open_cp", "sources")
filename = os.path.join(_sources_dir, "chicago.csv")
filename1 = os.path.join(_sources_dir, "chicago1.csv")
# The "all" files are xz-compressed and are read with `lzma.open`.
filename_all = os.path.join(_sources_dir, "chicago_all.csv.xz")
filename_all1 = os.path.join(_sources_dir, "chicago_all1.csv.xz")
The files filename and filename1 were downloaded from, respectively, the US Gov website, and the Chicago site. They are slightly different in size, but appear to contain the same data. (This can be checked!)
The files filename_all and filename_all1 were also downloaded from, respectively, the US Gov website, and the Chicago site. While they are the same size (uncompressed), and have the same headers, the data appears, at least naively, to be different.
In [3]:
# Print the first (header) line of each complete download so the two can be
# compared by eye.
for path in (filename_all, filename_all1):
    with lzma.open(path, "rt") as file:
        print(next(file))
In [4]:
# Print the first data row (the line after the header) of each complete download.
for path in (filename_all, filename_all1):
    with lzma.open(path, "rt") as file:
        next(file)  # discard the header line
        print(next(file))
In [5]:
# NB: These methods encode a missing geometry as (-1, -1)
def yield_tuples(f):
    """Yield one flat tuple per non-homicide event read from `f`.

    Features are produced by `chicago.generate_GeoJSON_Features(f, type="all")`;
    any feature whose "crime" property is "HOMICIDE" is skipped.

    :param f: Object passed straight through to
      `chicago.generate_GeoJSON_Features`.

    :return: Generator of tuples `(case, crime, type, location, timestamp,
      address, coord0, coord1)`, where `coord0, coord1` are the first two
      entries of the geometry's "coordinates", or `-1, -1` when the feature
      has no geometry.
    """
    for feature in chicago.generate_GeoJSON_Features(f, type="all"):
        props = feature["properties"]
        if props["crime"] == "HOMICIDE":
            continue
        geometry = feature["geometry"]
        # Sentinel pair stands in for events that carry no location.
        point = (-1, -1) if geometry is None else geometry["coordinates"]
        yield (
            props["case"],
            props["crime"],
            props["type"],
            props["location"],
            props["timestamp"],
            props["address"],
            point[0],
            point[1],
        )
def load_as_tuples(f):
    """Read every event from `f` into a list.

    :param f: Object passed straight through to `yield_tuples`.

    :return: List of the event tuples produced by `yield_tuples(f)`, in the
      order they were generated.
    """
    # The list was previously built but never returned, so callers always
    # received `None`.
    return list(yield_tuples(f))
def load_as_dict_to_lists(f):
    """Group the events read from `f` by their case identifier.

    :param f: Object passed straight through to `yield_tuples`.

    :return: Dictionary mapping each case (the first entry of an event tuple)
      to the list of remaining tuple entries, one list item per event seen
      with that case, in the order encountered.
    """
    by_case = {}
    for record in yield_tuples(f):
        # A case can occur more than once, hence a list per key.
        by_case.setdefault(record[0], []).append(record[1:])
    return by_case
In [6]:
def compare_one_other(file1, file2):
    """Find the events which occur in only one of two xz-compressed files.

    Events are compared as a multiset: an event repeated `n` times in one
    file must be repeated `n` times in the other to be fully matched.

    :param file1: Path of the first file; opened with `lzma.open` in text
      mode and loaded completely via `load_as_dict_to_lists`.
    :param file2: Path of the second file; opened with `lzma.open` in text
      mode and streamed via `yield_tuples`.

    :return: Pair `(in_only1, in_only2)` of lists of full event tuples (case
      first) which were found only in `file1`, respectively only in `file2`.
    """
    with lzma.open(file1, "rt") as file:
        events = load_as_dict_to_lists(file)

    in_only2 = []
    with lzma.open(file2, "rt") as file:
        for event in yield_tuples(file):
            case, e = event[0], event[1:]
            if case not in events or e not in events[case]:
                in_only2.append(event)
                continue
            # Consume the match so a duplicate in file2 needs its own
            # duplicate in file1.
            events[case].remove(e)
            if len(events[case]) == 0:
                del events[case]

    # Whatever is left was never matched.  Each dictionary value is a *list*
    # of tuples, so emit one full tuple per leftover; concatenating
    # `(case,)` with the list itself raises TypeError.
    in_only1 = [(case,) + e for case, leftovers in events.items() for e in leftovers]
    return in_only1, in_only2
In [7]:
# Returns the pair (events only in the first file, events only in the second file);
# homicides are never compared because `yield_tuples` skips them.
compare_one_other(filename_all, filename_all1)
Out[7]:
In [38]:
import pyproj, numpy
# Projection used by `check_file` to recompute the CSV's X/Y columns from longitude/latitude.
# NOTE(review): presumably EPSG:3435 is the Illinois state-plane system in feet and
# `preserve_units=True` keeps those native units -- confirm against the pyproj docs.
# NOTE(review): the `{'init': ...}` form is reported as deprecated by newer pyproj
# releases -- confirm before upgrading.
proj = pyproj.Proj({'init': 'epsg:3435'}, preserve_units=True)
def check_file(file):
    """Assert that the coordinate columns of a CSV file agree with each other.

    The header must name columns 15, 16, 19, 20 and 21 as "X Coordinate",
    "Y Coordinate", "Latitude", "Longitude" and "Location".  For every data
    row, either all five of those fields are blank, or "Location" equals
    "(<lat>, <lon>)" and projecting (longitude, latitude) with the
    module-level `proj` and rounding reproduces the integer X and Y fields.

    :param file: Iterable of text lines, as accepted by `csv.reader`.

    :raises AssertionError: If the header or any row fails a check.
    """
    rows = csv.reader(file)
    columns = next(rows)
    expected_titles = {
        15: "X Coordinate",
        16: "Y Coordinate",
        19: "Latitude",
        20: "Longitude",
        21: "Location",
    }
    for position, title in expected_titles.items():
        assert columns[position] == title

    for record in rows:
        x, y = record[15], record[16]
        lat, lon, latlon = record[19], record[20], record[21]
        if x == "":
            # No X coordinate: every other coordinate field must be blank too.
            for field in (y, lat, lon, latlon):
                assert field == ""
            continue
        assert latlon == "({}, {})".format(lat, lon)
        # `proj` is called with longitude first, then latitude.
        xx, yy = proj(float(lon), float(lat))
        assert int(x) == numpy.round(xx)
        assert int(y) == numpy.round(yy)
In [39]:
# Validate the coordinate columns of the complete download held in `filename_all`.
with lzma.open(filename_all, "rt") as csv_file:
    check_file(csv_file)
In [40]:
# Validate the coordinate columns of the complete download held in `filename_all1`.
with lzma.open(filename_all1, "rt") as csv_file:
    check_file(csv_file)
In [9]:
# Index every non-homicide event of the complete download by case identifier,
# ready for lookup by the comparison loop.
with lzma.open(filename_all, "rt") as csv_file:
    all_events = load_as_dict_to_lists(csv_file)
In [10]:
# Load open_cp's default Chicago data (judging by the function name, as a
# GeoDataFrame -- confirm against `open_cp.sources.chicago`) and preview it.
frame = chicago.load_to_geoDataFrame()
frame.head()
Out[10]:
In [11]:
# Cases excluded from the comparison below; they are displayed in a later cell.
known_diffs = {"JA233208", "JA228951", "JA249656", "JA256373", "JA256594", "JA256838"}
# Cases in `frame` which do not occur at all in `all_events`.
not_found = []
for index, row in frame.iterrows():
    # Homicides are absent from `all_events` (dropped by `yield_tuples`).
    if row.crime == "HOMICIDE" or row.case in known_diffs:
        continue
    if row.case not in all_events:
        not_found.append(row.case)
        continue
    matches = all_events[row.case]
    if len(matches) > 1:
        # Ambiguous: more than one event shares this case identifier.
        print("Doubled, skipping:", row.case)
        continue
    # Tuple layout as produced by `yield_tuples`, with the case stripped off.
    crime, kind, location, timestamp, address, x, y = matches[0]
    assert row.address == address
    assert row.crime == crime
    assert row.location == location
    assert row.timestamp == timestamp
    assert row.type == kind
    if row.geometry is not None:
        first_point = row.geometry.coords[0]
        assert first_point[0] == x
        assert first_point[1] == y
In [12]:
# Cases from `frame` (homicides and `known_diffs` excluded) with no entry in `all_events`.
not_found
Out[12]:
In [14]:
# Show the rows whose case identifier is one of the known differences.
frame[frame.case.isin(known_diffs)]
Out[14]:
In [ ]: