In [176]:
"""
0.1-first-pass-eda.ipynb
The very first pass through, examining the data,
looking around an initial set of data,
while trying to keep it DRY and programmatic,
and reproducable.

RESULT: 
There is something broken or misunderstood about how the script below is using offset.
It needs to be figured out in order to programmatically access the data available 
on data.sfgov.org in an accurate and effective manner.
"""
from __future__ import division, print_function # For Python3 future-proofing
import os
import pandas as pd
import numpy as np

In [177]:
# Gotcha: be explicit in the API call so the row ordering stays stable
# across requests -- pin $limit/$offset and sort by received_dttm.
resource_url = 'https://data.sfgov.org/resource/kikm-y2iv.json'
page_query = '$limit=1000&$offset=0&$order=received_dttm'
df = pd.read_json(resource_url + '?' + page_query)

In [178]:
df.head()


Out[178]:
address als_unit available_dttm battalion box call_date call_final_disposition call_number call_type_group city ... response_dttm rowid station_area supervisor_district transport_dttm unit_id unit_sequence_in_call_dispatch unit_type watch_date zipcode_of_incident
0 600 Block of LAGUNA ST False 2016-01-01T01:17:47 B02 3413 2016-01-01T00:00:00 Code 2 Transport 160010009 Potentially Life-Threatening San Francisco ... 2016-01-01T00:10:30 160010009-KM11 36 5 2016-01-01T00:42:14 KM11 1 PRIVATE 2015-12-31T00:00:00 94102
1 600 Block of LAGUNA ST False 2016-01-01T00:15:00 B02 3413 2016-01-01T00:00:00 Code 2 Transport 160010009 Potentially Life-Threatening San Francisco ... 2016-01-01T00:05:22 160010009-KM06 36 5 2016-04-25T12:54:45 KM06 2 PRIVATE 2015-12-31T00:00:00 94102
2 PRECITA AV/FLORIDA ST True 2016-01-01T00:12:51 B06 5621 2016-01-01T00:00:00 No Merit 160010015 Fire San Francisco ... 2016-01-01T00:05:52 160010015-E09 9 9 2016-04-25T12:54:45 E09 1 ENGINE 2015-12-31T00:00:00 94110
3 1600 Block of EUCALYPTUS DR False 2016-01-01T00:15:04 B08 8815 2016-01-01T00:00:00 Fire 160010018 Alarm San Francisco ... 2016-01-01T00:07:06 160010018-B08 19 7 2016-04-25T12:54:45 B08 3 CHIEF 2015-12-31T00:00:00 94132
4 1600 Block of EUCALYPTUS DR True 2016-01-01T00:15:04 B08 8815 2016-01-01T00:00:00 Fire 160010018 Alarm San Francisco ... 2016-01-01T00:06:12 160010018-E18 19 7 2016-04-25T12:54:45 E18 2 ENGINE 2015-12-31T00:00:00 94132

5 rows × 33 columns


In [179]:
df.shape  # hmm... so by default the json call is only providing me with 1,000 rows; I'll have to fix that


Out[179]:
(1000, 33)

In [180]:
df.info()  # we need to potentially fix all of these "object" types to be something specific


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 33 columns):
address                           1000 non-null object
als_unit                          1000 non-null bool
available_dttm                    1000 non-null object
battalion                         1000 non-null object
box                               1000 non-null int64
call_date                         1000 non-null object
call_final_disposition            1000 non-null object
call_number                       1000 non-null int64
call_type_group                   998 non-null object
city                              1000 non-null object
dispatch_dttm                     1000 non-null object
entry_dttm                        1000 non-null object
final_priority                    1000 non-null int64
fire_prevention_district          1000 non-null object
hospital_dttm                     1000 non-null object
incident_number                   1000 non-null int64
location                          1000 non-null object
neighborhood_district             1000 non-null object
number_of_alarms                  1000 non-null int64
on_scene_dttm                     1000 non-null object
original_priority                 1000 non-null object
priority                          1000 non-null object
received_dttm                     1000 non-null object
response_dttm                     1000 non-null object
rowid                             1000 non-null object
station_area                      1000 non-null int64
supervisor_district               1000 non-null int64
transport_dttm                    1000 non-null object
unit_id                           1000 non-null object
unit_sequence_in_call_dispatch    1000 non-null int64
unit_type                         1000 non-null object
watch_date                        1000 non-null object
zipcode_of_incident               1000 non-null int64
dtypes: bool(1), int64(9), object(23)
memory usage: 258.8+ KB

In [181]:
df.describe(include=['O'])  # let's look at the types of object data


Out[181]:
address available_dttm battalion call_date call_final_disposition call_type_group city dispatch_dttm entry_dttm fire_prevention_district ... on_scene_dttm original_priority priority received_dttm response_dttm rowid transport_dttm unit_id unit_type watch_date
count 1000 1000 1000 1000 1000 998 1000 1000 1000 1000 ... 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000
unique 399 940 11 1 12 4 3 591 448 11 ... 772 5 4 446 952 1000 242 156 9 2
top 5700 Block of MISSION ST 2016-01-01T16:29:55 B03 2016-01-01T00:00:00 Code 2 Transport Potentially Life-Threatening San Francisco 2016-01-01T14:21:41 2016-01-01T16:27:41 1 ... 2016-04-25T12:54:45 3 3 2016-01-01T16:26:12 2016-04-25T12:54:45 160011278-73 2016-04-25T12:54:45 E03 ENGINE 2015-12-31T00:00:00
freq 20 10 171 1000 486 445 990 13 20 167 ... 148 617 642 20 25 1 502 33 391 521

4 rows × 23 columns


In [182]:
print(df.call_date.min(), df.call_date.max())  # this shows the min and max values in terms of dates. One day!


2016-01-01T00:00:00 2016-01-01T00:00:00

In [183]:
# Print one column name per line, for easy scanning.
print('\n'.join(df.columns))


address
als_unit
available_dttm
battalion
box
call_date
call_final_disposition
call_number
call_type_group
city
dispatch_dttm
entry_dttm
final_priority
fire_prevention_district
hospital_dttm
incident_number
location
neighborhood_district
number_of_alarms
on_scene_dttm
original_priority
priority
received_dttm
response_dttm
rowid
station_area
supervisor_district
transport_dttm
unit_id
unit_sequence_in_call_dispatch
unit_type
watch_date
zipcode_of_incident

In [184]:
# What is this fire_prevention_district about?  Tally every value,
# including missing ones.
df['fire_prevention_district'].value_counts(dropna=False)


Out[184]:
1       167
3       158
2       144
5        96
7        92
6        86
4        75
9        70
10       51
8        47
None     14
Name: fire_prevention_district, dtype: int64

In [185]:
df.call_type_group.value_counts(dropna=False)  # curious to see what the call types are


Out[185]:
Potentially Life-Threatening    445
Non Life-threatening            270
Alarm                           212
Fire                             71
NaN                               2
Name: call_type_group, dtype: int64

In [186]:
df.call_final_disposition.value_counts(dropna=False)


Out[186]:
Code 2 Transport              486
Fire                          268
No Merit                       64
Patient Declined Transport     58
SFPD                           23
Cancelled                      22
Other                          18
Code 3 Transport               16
Medical Examiner               16
Unable to Locate               13
Against Medical Advice         12
Gone on Arrival                 4
Name: call_final_disposition, dtype: int64

In [187]:
# now let's try to grab a different url based on the API docs for S
# https://dev.socrata.com/docs/filtering.html
# df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?call_final_disposition=Fire&$limit=1000&$offset=0&$order=:id')
df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?call_final_disposition=Fire&$limit=1000&$offset=0&$order=received_dttm')

In [188]:
df.call_type_group.value_counts(dropna=False)


Out[188]:
Alarm                           769
Fire                            182
Non Life-threatening             34
Potentially Life-Threatening      9
NaN                               6
Name: call_type_group, dtype: int64

In [189]:
df.call_final_disposition.value_counts(dropna=False)


Out[189]:
Fire    1000
Name: call_final_disposition, dtype: int64

In [190]:
# confirm we have different timestamps than before
print(df.call_date.min(), df.call_date.max())  # a little bit of a bigger window, nice!


2016-01-01T00:00:00 2016-01-04T00:00:00

In [191]:
# now we'll just experiment with the offset to see what we get, preferably a second page of results
df2 = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?$order=received_dttm&call_final_disposition=Fire&$offset=5')
#                      https://data.sfgov.org/resource/enhu-st7v.json

In [192]:
# Inspect the received_dttm window for this pull.
print(df2.received_dttm.min(), df2.received_dttm.max())  # a little bit of a bigger window, nice!
# Recorded min/max by $offset value:
# 0 2016-01-01T00:03:02 2016-01-04T15:36:16
# 1 2016-01-01T00:03:02 2016-01-04T15:36:16
# 2 2016-01-01T00:03:02 2016-01-04T15:44:38
# 3 2016-01-01T00:35:02 2016-01-04T15:53:36
# 4 2016-01-01T00:35:02 2016-01-04T15:53:36
# NOTE(review): the window shifts by roughly one row per unit of offset --
# consistent with $offset skipping individual rows, not pages.  Confirm
# against the Socrata paging docs.


2016-01-01T00:35:02 2016-01-04T15:53:36

In [193]:
# hmmm... it appears I'm missing something where the correct data is not getting returned
# I'll want to open a ticket with Socrates regarding pagination
# in the meantime let me try a different url for access the data to see if that improves things:
# sorted_url = "https://data.sfgov.org/Public-Safety/FD-CFS-sorted-by-earliest-received-date/sg7s-kczx"
df = pd.read_json("https://data.sfgov.org/resource/sg7s-kczx.json?$order=received_dttm&$offset=0")

In [194]:
print(df.received_dttm.min(), df.received_dttm.max())  # a little bit of a bigger window, nice!


2000-04-12T21:00:29 2000-04-14T12:49:56

In [195]:
# actually, I have here a set of data that is more fire-specific, that I think is better to use
# but it's not sorted
# https://data.sfgov.org/resource/wbb6-uh78.json
df = pd.read_json('https://data.sfgov.org/resource/wbb6-uh78.json?$order=incident_number&$offset=0')

In [196]:
print(df.incident_number.min())
print(df.incident_number.max())
# offset
# 0 3000001 3003135
# 1 3000003 3003136
# 2 3000006 3003139
# 3 3000007 3003143


3000014
3003149

In [197]:
print(df.alarm_dttm.min(), df.alarm_dttm.max())  # a little bit of a bigger window, nice!
# 0 2003-01-07T02:55:22.000 2015-06-20T02:44:56.000
# 1 2003-01-01T00:07:32.000 2003-01-11T07:49:30.000
# 200 2003-01-02T18:28:53.000 2003-01-13T12:05:02.000


2003-01-01T00:18:19.000 2003-01-11T09:17:51.000

In [ ]: