notebook.community

Edit and run



In [176]:

    
"""
0.1-first-pass-eda.ipynb
The very first pass through, examining the data,
looking around an initial set of data,
while trying to keep it DRY and programmatic,
and reproducable.

RESULT: 
There is something broken or misunderstood about how the script below is using offset.
It needs to be figured out in order to programmatically access the data available 
on data.sfgov.org in an accurate and effective manner.
"""
from __future__ import division, print_function # For Python3 future-proofing
import os
import pandas as pd
import numpy as np



In [177]:

    
# GOTCHYA: be explicit about the API call to ensure the ordering stays the same, and add an ordering to it
df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?$limit=1000&$offset=0&$order=received_dttm')



In [178]:

    
df.head()









    Out[178]:






  
    
      
      address
      als_unit
      available_dttm
      battalion
      box
      call_date
      call_final_disposition
      call_number
      call_type_group
      city
      ...
      response_dttm
      rowid
      station_area
      supervisor_district
      transport_dttm
      unit_id
      unit_sequence_in_call_dispatch
      unit_type
      watch_date
      zipcode_of_incident
    
  
  
    
      0
      600 Block of LAGUNA ST
      False
      2016-01-01T01:17:47
      B02
      3413
      2016-01-01T00:00:00
      Code 2 Transport
      160010009
      Potentially Life-Threatening
      San Francisco
      ...
      2016-01-01T00:10:30
      160010009-KM11
      36
      5
      2016-01-01T00:42:14
      KM11
      1
      PRIVATE
      2015-12-31T00:00:00
      94102
    
    
      1
      600 Block of LAGUNA ST
      False
      2016-01-01T00:15:00
      B02
      3413
      2016-01-01T00:00:00
      Code 2 Transport
      160010009
      Potentially Life-Threatening
      San Francisco
      ...
      2016-01-01T00:05:22
      160010009-KM06
      36
      5
      2016-04-25T12:54:45
      KM06
      2
      PRIVATE
      2015-12-31T00:00:00
      94102
    
    
      2
      PRECITA AV/FLORIDA ST
      True
      2016-01-01T00:12:51
      B06
      5621
      2016-01-01T00:00:00
      No Merit
      160010015
      Fire
      San Francisco
      ...
      2016-01-01T00:05:52
      160010015-E09
      9
      9
      2016-04-25T12:54:45
      E09
      1
      ENGINE
      2015-12-31T00:00:00
      94110
    
    
      3
      1600 Block of EUCALYPTUS DR
      False
      2016-01-01T00:15:04
      B08
      8815
      2016-01-01T00:00:00
      Fire
      160010018
      Alarm
      San Francisco
      ...
      2016-01-01T00:07:06
      160010018-B08
      19
      7
      2016-04-25T12:54:45
      B08
      3
      CHIEF
      2015-12-31T00:00:00
      94132
    
    
      4
      1600 Block of EUCALYPTUS DR
      True
      2016-01-01T00:15:04
      B08
      8815
      2016-01-01T00:00:00
      Fire
      160010018
      Alarm
      San Francisco
      ...
      2016-01-01T00:06:12
      160010018-E18
      19
      7
      2016-04-25T12:54:45
      E18
      2
      ENGINE
      2015-12-31T00:00:00
      94132
    
  

5 rows × 33 columns



In [179]:

    
df.shape  # hmm... so by default the json call is only providing me with 1,000 rows; I'll have to fix that









    Out[179]:





(1000, 33)



In [180]:

    
df.info()  # we need to potentially fix all of these "object" types to be something specific









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 33 columns):
address                           1000 non-null object
als_unit                          1000 non-null bool
available_dttm                    1000 non-null object
battalion                         1000 non-null object
box                               1000 non-null int64
call_date                         1000 non-null object
call_final_disposition            1000 non-null object
call_number                       1000 non-null int64
call_type_group                   998 non-null object
city                              1000 non-null object
dispatch_dttm                     1000 non-null object
entry_dttm                        1000 non-null object
final_priority                    1000 non-null int64
fire_prevention_district          1000 non-null object
hospital_dttm                     1000 non-null object
incident_number                   1000 non-null int64
location                          1000 non-null object
neighborhood_district             1000 non-null object
number_of_alarms                  1000 non-null int64
on_scene_dttm                     1000 non-null object
original_priority                 1000 non-null object
priority                          1000 non-null object
received_dttm                     1000 non-null object
response_dttm                     1000 non-null object
rowid                             1000 non-null object
station_area                      1000 non-null int64
supervisor_district               1000 non-null int64
transport_dttm                    1000 non-null object
unit_id                           1000 non-null object
unit_sequence_in_call_dispatch    1000 non-null int64
unit_type                         1000 non-null object
watch_date                        1000 non-null object
zipcode_of_incident               1000 non-null int64
dtypes: bool(1), int64(9), object(23)
memory usage: 258.8+ KB



In [181]:

    
df.describe(include=['O'])  # let's look at the types of object data









    Out[181]:






  
    
      
      address
      available_dttm
      battalion
      call_date
      call_final_disposition
      call_type_group
      city
      dispatch_dttm
      entry_dttm
      fire_prevention_district
      ...
      on_scene_dttm
      original_priority
      priority
      received_dttm
      response_dttm
      rowid
      transport_dttm
      unit_id
      unit_type
      watch_date
    
  
  
    
      count
      1000
      1000
      1000
      1000
      1000
      998
      1000
      1000
      1000
      1000
      ...
      1000
      1000
      1000
      1000
      1000
      1000
      1000
      1000
      1000
      1000
    
    
      unique
      399
      940
      11
      1
      12
      4
      3
      591
      448
      11
      ...
      772
      5
      4
      446
      952
      1000
      242
      156
      9
      2
    
    
      top
      5700 Block of MISSION ST
      2016-01-01T16:29:55
      B03
      2016-01-01T00:00:00
      Code 2 Transport
      Potentially Life-Threatening
      San Francisco
      2016-01-01T14:21:41
      2016-01-01T16:27:41
      1
      ...
      2016-04-25T12:54:45
      3
      3
      2016-01-01T16:26:12
      2016-04-25T12:54:45
      160011278-73
      2016-04-25T12:54:45
      E03
      ENGINE
      2015-12-31T00:00:00
    
    
      freq
      20
      10
      171
      1000
      486
      445
      990
      13
      20
      167
      ...
      148
      617
      642
      20
      25
      1
      502
      33
      391
      521
    
  

4 rows × 23 columns



In [182]:

    
print(df.call_date.min(), df.call_date.max())  # this shows the min and max values in terms of dates. One day!









    



2016-01-01T00:00:00 2016-01-01T00:00:00



In [183]:

    
for c in df.columns:
    print(c)









    



address
als_unit
available_dttm
battalion
box
call_date
call_final_disposition
call_number
call_type_group
city
dispatch_dttm
entry_dttm
final_priority
fire_prevention_district
hospital_dttm
incident_number
location
neighborhood_district
number_of_alarms
on_scene_dttm
original_priority
priority
received_dttm
response_dttm
rowid
station_area
supervisor_district
transport_dttm
unit_id
unit_sequence_in_call_dispatch
unit_type
watch_date
zipcode_of_incident



In [184]:

    
# What is this fire prevention district about?
df.fire_prevention_district.value_counts(dropna=False)









    Out[184]:





1       167
3       158
2       144
5        96
7        92
6        86
4        75
9        70
10       51
8        47
None     14
Name: fire_prevention_district, dtype: int64



In [185]:

    
df.call_type_group.value_counts(dropna=False)  # curious to see what the call types are









    Out[185]:





Potentially Life-Threatening    445
Non Life-threatening            270
Alarm                           212
Fire                             71
NaN                               2
Name: call_type_group, dtype: int64



In [186]:

    
df.call_final_disposition.value_counts(dropna=False)









    Out[186]:





Code 2 Transport              486
Fire                          268
No Merit                       64
Patient Declined Transport     58
SFPD                           23
Cancelled                      22
Other                          18
Code 3 Transport               16
Medical Examiner               16
Unable to Locate               13
Against Medical Advice         12
Gone on Arrival                 4
Name: call_final_disposition, dtype: int64



In [187]:

    
# now let's try to grab a different url based on the API docs for S
# https://dev.socrata.com/docs/filtering.html
# df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?call_final_disposition=Fire&$limit=1000&$offset=0&$order=:id')
df = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?call_final_disposition=Fire&$limit=1000&$offset=0&$order=received_dttm')



In [188]:

    
df.call_type_group.value_counts(dropna=False)









    Out[188]:





Alarm                           769
Fire                            182
Non Life-threatening             34
Potentially Life-Threatening      9
NaN                               6
Name: call_type_group, dtype: int64



In [189]:

    
df.call_final_disposition.value_counts(dropna=False)









    Out[189]:





Fire    1000
Name: call_final_disposition, dtype: int64



In [190]:

    
# confirm we have different timestamps than before
print(df.call_date.min(), df.call_date.max())  # a little bit of a bigger window, nice!









    



2016-01-01T00:00:00 2016-01-04T00:00:00



In [191]:

    
# now we'll just experiment with the offset to see what we get, preferably a second page of results
df2 = pd.read_json('https://data.sfgov.org/resource/kikm-y2iv.json?$order=received_dttm&call_final_disposition=Fire&$offset=5')
#                      https://data.sfgov.org/resource/enhu-st7v.json



In [192]:

    
print(df2.received_dttm.min(), df2.received_dttm.max())  # a little bit of a bigger window, nice!
# by offset:
# 0 2016-01-01T00:03:02 2016-01-04T15:36:16
# 1 2016-01-01T00:03:02 2016-01-04T15:36:16
# 2 2016-01-01T00:03:02 2016-01-04T15:44:38
# 3 2016-01-01T00:35:02 2016-01-04T15:53:36
# 4 2016-01-01T00:35:02 2016-01-04T15:53:36









    



2016-01-01T00:35:02 2016-01-04T15:53:36



In [193]:

    
# hmmm... it appears I'm missing something where the correct data is not getting returned
# I'll want to open a ticket with Socrates regarding pagination
# in the meantime let me try a different url for access the data to see if that improves things:
# sorted_url = "https://data.sfgov.org/Public-Safety/FD-CFS-sorted-by-earliest-received-date/sg7s-kczx"
df = pd.read_json("https://data.sfgov.org/resource/sg7s-kczx.json?$order=received_dttm&$offset=0")



In [194]:

    
print(df.received_dttm.min(), df.received_dttm.max())  # a little bit of a bigger window, nice!









    



2000-04-12T21:00:29 2000-04-14T12:49:56



In [195]:

    
# actually, I have here a set of data that is more fire-specific, that I think is better to use
# but it's not sorted
# https://data.sfgov.org/resource/wbb6-uh78.json
df = pd.read_json('https://data.sfgov.org/resource/wbb6-uh78.json?$order=incident_number&$offset=0')



In [196]:

    
print(df.incident_number.min())
print(df.incident_number.max())
# offset
# 0 3000001 3003135
# 1 3000003 3003136
# 2 3000006 3003139
# 3 3000007 3003143



In [197]:

    
print(df.alarm_dttm.min(), df.alarm_dttm.max())  # a little bit of a bigger window, nice!
# 0 2003-01-07T02:55:22.000 2015-06-20T02:44:56.000
# 1 2003-01-01T00:07:32.000 2003-01-11T07:49:30.000
# 200 2003-01-02T18:28:53.000 2003-01-13T12:05:02.000









    



2003-01-01T00:18:19.000 2003-01-11T09:17:51.000



In [ ]:

	address	als_unit	available_dttm	battalion	box	call_date	call_final_disposition	call_number	call_type_group	city	...	response_dttm	rowid	station_area	supervisor_district	transport_dttm	unit_id	unit_sequence_in_call_dispatch	unit_type	watch_date	zipcode_of_incident
0	600 Block of LAGUNA ST	False	2016-01-01T01:17:47	B02	3413	2016-01-01T00:00:00	Code 2 Transport	160010009	Potentially Life-Threatening	San Francisco	...	2016-01-01T00:10:30	160010009-KM11	36	5	2016-01-01T00:42:14	KM11	1	PRIVATE	2015-12-31T00:00:00	94102
1	600 Block of LAGUNA ST	False	2016-01-01T00:15:00	B02	3413	2016-01-01T00:00:00	Code 2 Transport	160010009	Potentially Life-Threatening	San Francisco	...	2016-01-01T00:05:22	160010009-KM06	36	5	2016-04-25T12:54:45	KM06	2	PRIVATE	2015-12-31T00:00:00	94102
2	PRECITA AV/FLORIDA ST	True	2016-01-01T00:12:51	B06	5621	2016-01-01T00:00:00	No Merit	160010015	Fire	San Francisco	...	2016-01-01T00:05:52	160010015-E09	9	9	2016-04-25T12:54:45	E09	1	ENGINE	2015-12-31T00:00:00	94110
3	1600 Block of EUCALYPTUS DR	False	2016-01-01T00:15:04	B08	8815	2016-01-01T00:00:00	Fire	160010018	Alarm	San Francisco	...	2016-01-01T00:07:06	160010018-B08	19	7	2016-04-25T12:54:45	B08	3	CHIEF	2015-12-31T00:00:00	94132
4	1600 Block of EUCALYPTUS DR	True	2016-01-01T00:15:04	B08	8815	2016-01-01T00:00:00	Fire	160010018	Alarm	San Francisco	...	2016-01-01T00:06:12	160010018-E18	19	7	2016-04-25T12:54:45	E18	2	ENGINE	2015-12-31T00:00:00	94132

	address	available_dttm	battalion	call_date	call_final_disposition	call_type_group	city	dispatch_dttm	entry_dttm	fire_prevention_district	...	on_scene_dttm	original_priority	priority	received_dttm	response_dttm	rowid	transport_dttm	unit_id	unit_type	watch_date
count	1000	1000	1000	1000	1000	998	1000	1000	1000	1000	...	1000	1000	1000	1000	1000	1000	1000	1000	1000	1000
unique	399	940	11	1	12	4	3	591	448	11	...	772	5	4	446	952	1000	242	156	9	2
top	5700 Block of MISSION ST	2016-01-01T16:29:55	B03	2016-01-01T00:00:00	Code 2 Transport	Potentially Life-Threatening	San Francisco	2016-01-01T14:21:41	2016-01-01T16:27:41	1	...	2016-04-25T12:54:45	3	3	2016-01-01T16:26:12	2016-04-25T12:54:45	160011278-73	2016-04-25T12:54:45	E03	ENGINE	2015-12-31T00:00:00
freq	20	10	171	1000	486	445	990	13	20	167	...	148	617	642	20	25	1	502	33	391	521