In [176]:
"""
0.1-first-pass-eda.ipynb
The very first pass through, examining the data,
looking around an initial set of data,
while trying to keep it DRY and programmatic,
and reproducible.
RESULT:
There is something broken or misunderstood about how the script below is using offset.
It needs to be figured out in order to programmatically access the data available
on data.sfgov.org in an accurate and effective manner.
"""
from __future__ import division, print_function # For Python3 future-proofing
import os
import pandas as pd
import numpy as np
In [177]:
# GOTCHA: be explicit about the API call so the ordering stays the same across requests.
# NOTE(review): received_dttm is not unique, so Socrata paging ordered by it alone can
# repeat/skip rows between pages — append a unique column (e.g. ",:id") for stable paging.
df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?$limit=1000&$offset=0&$order=received_dttm')
In [178]:
df.head()  # preview the first rows to sanity-check columns and values
Out[178]:
In [179]:
df.shape # 1,000 rows — matches the $limit=1000 we asked for (also Socrata's default page size); paging needed for more
Out[179]:
In [180]:
df.info() # we need to potentially fix all of these "object" types to be something specific
Out[181]:
In [181]:
df.describe(include=['O']) # let's look at the types of object data
Out[181]:
In [182]:
print(df.call_date.min(), df.call_date.max()) # this shows the min and max values in terms of dates. One day!
In [183]:
# List every column name, one per line.
# FIX: the loop body must be indented — as exported, `print(c)` sat at column 0,
# which is a SyntaxError under a fresh Restart-and-Run-All.
for c in df.columns:
    print(c)
In [184]:
# What is this fire prevention district about?
df.fire_prevention_district.value_counts(dropna=False)
Out[184]:
In [185]:
df.call_type_group.value_counts(dropna=False) # curious to see what the call types are
Out[185]:
In [186]:
df.call_final_disposition.value_counts(dropna=False)
Out[186]:
In [187]:
# now let's try to grab a different url based on the API docs for Socrata filtering:
# https://dev.socrata.com/docs/filtering.html
# df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?call_final_disposition=Fire&$limit=1000&$offset=0&$order=:id')
df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?call_final_disposition=Fire&$limit=1000&$offset=0&$order=received_dttm')
In [188]:
df.call_type_group.value_counts(dropna=False)
Out[188]:
In [189]:
# confirm the filter worked: every row should now have call_final_disposition == 'Fire'
df.call_final_disposition.value_counts(dropna=False)
Out[189]:
In [190]:
# confirm we have different timestamps than before
print(df.call_date.min(), df.call_date.max()) # a little bit of a bigger window, nice!
In [191]:
# Experiment with paging — this is the "offset" confusion flagged in the header RESULT note.
# Two gotchas with Socrata (SODA) paging:
#   1. $offset counts ROWS, not pages: page k of size L needs $limit=L&$offset=k*L.
#      The earlier '$offset=5' merely skipped the first 5 rows, which is why the
#      min/max timestamps barely moved between offsets 0-4 (see the log below).
#   2. Paging is only stable when $order includes a unique column; received_dttm has
#      ties, so append the system row id (:id) as a tie-breaker.
PAGE_SIZE = 1000  # Socrata's default page size, made explicit
page = 1          # 0-based page index; page 1 == the second page of results
df2 = pd.read_json(
    'https://data.sfgov.org/resource/kikm-y2iv.json'
    '?$order=received_dttm,:id'
    '&call_final_disposition=Fire'
    '&$limit={0}&$offset={1}'.format(PAGE_SIZE, page * PAGE_SIZE)
)
# Alternative endpoint for the same data family: https://data.sfgov.org/resource/enhu-st7v.json
In [192]:
print(df2.received_dttm.min(), df2.received_dttm.max()) # a little bit of a bigger window, nice!
# Observed min/max received_dttm for $offset values 0-4. The window barely shifts
# because $offset skips that many ROWS (not pages), so each bump drops one row:
# 0 2016-01-01T00:03:02 2016-01-04T15:36:16
# 1 2016-01-01T00:03:02 2016-01-04T15:36:16
# 2 2016-01-01T00:03:02 2016-01-04T15:44:38
# 3 2016-01-01T00:35:02 2016-01-04T15:53:36
# 4 2016-01-01T00:35:02 2016-01-04T15:53:36
In [193]:
# hmmm... it appears successive offsets are not returning the data I expected
# I'll want to open a ticket with Socrata regarding pagination
# in the meantime, try a different url for accessing the data to see if that improves things:
# sorted_url = "https://data.sfgov.org/Public-Safety/FD-CFS-sorted-by-earliest-received-date/sg7s-kczx"
df = pd.read_json("https://data.sfgov.org/resource/sg7s-kczx.json?$order=received_dttm&$offset=0")
In [194]:
print(df.received_dttm.min(), df.received_dttm.max()) # a little bit of a bigger window, nice!
In [195]:
# actually, I have here a set of data that is more fire-specific, that I think is better to use
# but it's not sorted
# https://data.sfgov.org/resource/wbb6-uh78.json
df = pd.read_json('https://data.sfgov.org/resource/wbb6-uh78.json?$order=incident_number&$offset=0')
In [196]:
print(df.incident_number.min())
print(df.incident_number.max())
# Observed incident_number min/max by $offset — the window advances monotonically here,
# which suggests incident_number is a (presumably unique) sort key suitable for paging
# — TODO confirm uniqueness with a $group query.
# offset
# 0 3000001 3003135
# 1 3000003 3003136
# 2 3000006 3003139
# 3 3000007 3003143
In [197]:
print(df.alarm_dttm.min(), df.alarm_dttm.max()) # a little bit of a bigger window, nice!
# 0 2003-01-07T02:55:22.000 2015-06-20T02:44:56.000
# 1 2003-01-01T00:07:32.000 2003-01-11T07:49:30.000
# 200 2003-01-02T18:28:53.000 2003-01-13T12:05:02.000
In [ ]: