In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
In [11]:
# Data from https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9
df = pd.read_csv("311.csv")
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (17,39,40,41,47,48,49) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
In [3]:
len(df)
Out[3]:
199999
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 53 columns):
Unique Key 199999 non-null int64
Created Date 199999 non-null object
Closed Date 195983 non-null object
Agency 199999 non-null object
Agency Name 199999 non-null object
Complaint Type 199999 non-null object
Descriptor 199132 non-null object
Location Type 78180 non-null object
Incident Zip 171566 non-null float64
Incident Address 125786 non-null object
Street Name 125785 non-null object
Cross Street 1 132623 non-null object
Cross Street 2 130444 non-null object
Intersection Street 1 71467 non-null object
Intersection Street 2 71446 non-null object
Address Type 195273 non-null object
City 171847 non-null object
Landmark 9 non-null object
Facility Type 29626 non-null object
Status 199999 non-null object
Due Date 22279 non-null object
Resolution Description 197477 non-null object
Resolution Action Updated Date 197302 non-null object
Community Board 199999 non-null object
Borough 199999 non-null object
X Coordinate (State Plane) 170540 non-null float64
Y Coordinate (State Plane) 170540 non-null float64
Park Facility Name 199999 non-null object
Park Borough 199999 non-null object
School Name 199999 non-null object
School Number 199962 non-null object
School Region 199715 non-null object
School Code 199715 non-null object
School Phone Number 199999 non-null object
School Address 199999 non-null object
School City 199999 non-null object
School State 199999 non-null object
School Zip 199999 non-null object
School Not Found 22080 non-null object
School or Citywide Complaint 13 non-null object
Vehicle Type 12 non-null object
Taxi Company Borough 64 non-null object
Taxi Pick Up Location 444 non-null object
Bridge Highway Name 155 non-null object
Bridge Highway Direction 155 non-null object
Road Ramp 151 non-null object
Bridge Highway Segment 160 non-null object
Garage Lot Name 28 non-null object
Ferry Direction 9 non-null object
Ferry Terminal Name 65 non-null object
Latitude 170540 non-null float64
Longitude 170540 non-null float64
Location 170540 non-null object
dtypes: float64(5), int64(1), object(47)
memory usage: 80.9+ MB
In [12]:
def parse_date(str_date):
return dateutil.parser.parse(str_date)
df['happened_at'] = df['Created Date'].apply(parse_date)
In [15]:
# How many ended up as not being parsed?
pd.isnull(df['happened_at']).value_counts()
Out[15]:
False 199999
Name: happened_at, dtype: int64
In [ ]:
# Move the index over, delete 'happened_at'
df.index = df['happened_at']
del df['happened_at']
In [21]:
# What's our range looking like?
print(df.index.min())
print(df.index.max())
2010-01-04 00:00:00
2016-05-20 01:13:15
In [22]:
# Let's group by year and see how many we have
df.resample("A").count()
Out[22]:
Unique Key
Created Date
Closed Date
Agency
Agency Name
Complaint Type
Descriptor
Location Type
Incident Zip
Incident Address
...
Bridge Highway Name
Bridge Highway Direction
Road Ramp
Bridge Highway Segment
Garage Lot Name
Ferry Direction
Ferry Terminal Name
Latitude
Longitude
Location
happened_at
2010-12-31
89509
89509
88895
89509
89509
89509
89441
26313
75468
54647
...
9
9
9
9
6
0
31
74914
74914
74914
2011-12-31
78604
78604
77998
78604
78604
78604
78581
25783
66446
45087
...
17
17
17
17
6
0
13
66529
66529
66529
2012-12-31
17
17
16
17
17
17
17
12
15
13
...
0
0
0
0
0
0
0
15
15
15
2013-12-31
17
17
9
17
17
17
17
10
13
13
...
1
1
1
1
0
0
0
12
12
12
2014-12-31
189
189
139
189
189
189
189
103
173
180
...
0
0
0
0
0
0
0
173
173
173
2015-12-31
2069
2069
1919
2069
2069
2069
2068
1031
1820
1623
...
0
0
0
0
0
0
0
1811
1811
1811
2016-12-31
29594
29594
27007
29594
29594
29594
28819
24928
27631
24223
...
128
128
124
133
16
9
21
27086
27086
27086
7 rows × 53 columns
In [24]:
# Since we don't have the entire data set we're missing a lot.
df.resample("A").count().plot(y='Unique Key')
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b69c0b8>
In [26]:
# Let's just grab 2010 then
df_2010 = df["2010"]
df_2010.head(2)
Out[26]:
Unique Key
Created Date
Closed Date
Agency
Agency Name
Complaint Type
Descriptor
Location Type
Incident Zip
Incident Address
...
Bridge Highway Name
Bridge Highway Direction
Road Ramp
Bridge Highway Segment
Garage Lot Name
Ferry Direction
Ferry Terminal Name
Latitude
Longitude
Location
happened_at
2010-11-18 10:53:00
19157621
11/18/2010 10:53:00 AM
11/18/2010 10:53:00 AM
DOT
Department of Transportation
Street Light Condition
Street Light Out
NaN
11422.0
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
40.654108
-73.746911
(40.654107999915, -73.74691077345408)
2010-11-01 17:55:00
19025135
11/01/2010 05:55:00 PM
11/01/2010 08:00:00 PM
DEP
Department of Environmental Protection
Asbestos
Asbestos Complaint (B1)
NaN
10036.0
235 W 46 ST
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
40.759129
-73.986341
(40.759128565735466, -73.98634107118339)
2 rows × 53 columns
In [31]:
df_2010.resample("B").count()
Out[31]:
Unique Key
Created Date
Closed Date
Agency
Agency Name
Complaint Type
Descriptor
Location Type
Incident Zip
Incident Address
...
Bridge Highway Name
Bridge Highway Direction
Road Ramp
Bridge Highway Segment
Garage Lot Name
Ferry Direction
Ferry Terminal Name
Latitude
Longitude
Location
happened_at
2010-01-04
4
4
4
4
4
4
4
0
4
3
...
0
0
0
0
0
0
0
4
4
4
2010-01-05
1
1
1
1
1
1
1
0
1
1
...
0
0
0
0
0
0
0
1
1
1
2010-01-06
1
1
1
1
1
1
1
1
1
1
...
0
0
0
0
0
0
0
1
1
1
2010-01-07
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-08
1
1
1
1
1
1
1
0
1
1
...
0
0
0
0
0
0
0
1
1
1
2010-01-11
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-12
1
1
1
1
1
1
1
0
1
1
...
0
0
0
0
0
0
0
1
1
1
2010-01-13
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-14
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-15
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-18
2
2
2
2
2
2
2
0
1
1
...
0
0
0
0
0
0
0
1
1
1
2010-01-19
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-20
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-21
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-22
1
1
1
1
1
1
1
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-25
2
2
2
2
2
2
2
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-26
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-27
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-28
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-01-29
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-01
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-02
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-03
1
1
1
1
1
1
1
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-04
2
2
2
2
2
2
2
0
1
1
...
0
0
0
0
0
0
0
1
1
1
2010-02-05
6
6
6
6
6
6
6
1
3
3
...
0
0
0
0
0
0
0
3
3
3
2010-02-08
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-09
2
2
2
2
2
2
2
0
1
2
...
0
0
0
0
0
0
0
1
1
1
2010-02-10
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-11
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2010-02-12
1
1
1
1
1
1
1
0
1
1
...
0
0
0
0
0
0
0
1
1
1
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2010-11-22
1767
1767
1749
1767
1767
1767
1767
425
1495
1081
...
1
1
1
1
0
0
0
1495
1495
1495
2010-11-23
1773
1773
1764
1773
1773
1773
1773
522
1560
1103
...
0
0
0
0
0
0
0
1562
1562
1562
2010-11-24
1685
1685
1677
1685
1685
1685
1684
444
1398
999
...
0
0
0
0
0
0
0
1398
1398
1398
2010-11-25
493
493
492
493
493
493
493
111
448
339
...
0
0
0
0
0
0
0
450
450
450
2010-11-26
2342
2342
2326
2342
2342
2342
2342
749
2122
1644
...
2
2
2
2
0
0
0
2124
2124
2124
2010-11-29
1731
1731
1726
1731
1731
1731
1729
455
1452
1089
...
0
0
0
0
0
0
1
1451
1451
1451
2010-11-30
1853
1853
1847
1853
1853
1853
1852
468
1571
1190
...
0
0
0
0
0
0
1
1575
1575
1575
2010-12-01
2441
2441
2432
2441
2441
2441
2441
459
1928
1328
...
1
1
1
1
0
0
0
1928
1928
1928
2010-12-02
1920
1920
1912
1920
1920
1920
1919
504
1607
1204
...
1
1
1
1
0
0
1
1609
1609
1609
2010-12-03
3348
3348
3336
3348
3348
3348
3346
1112
2896
2233
...
0
0
0
0
0
0
1
2898
2898
2898
2010-12-06
2009
2009
1997
2009
2009
2009
2009
509
1719
1205
...
0
0
0
0
0
0
0
1721
1721
1721
2010-12-07
1862
1862
1846
1862
1862
1862
1861
477
1495
1084
...
0
0
0
0
0
0
1
1495
1495
1495
2010-12-08
1940
1940
1930
1940
1940
1940
1939
549
1651
1271
...
0
0
0
0
0
0
1
1653
1653
1653
2010-12-09
1982
1982
1968
1982
1982
1982
1979
447
1526
1130
...
0
0
0
0
0
0
3
1526
1526
1526
2010-12-10
3127
3127
3105
3127
3127
3127
3126
926
2738
2036
...
0
0
0
0
2
0
1
2739
2739
2739
2010-12-13
2058
2058
2043
2058
2058
2058
2058
478
1655
1092
...
0
0
0
0
0
0
0
1656
1656
1656
2010-12-14
1978
1978
1965
1978
1978
1978
1977
497
1541
1048
...
0
0
0
0
0
0
0
1541
1541
1541
2010-12-15
1958
1958
1947
1958
1958
1958
1958
619
1642
1164
...
1
1
1
1
0
0
0
1642
1642
1642
2010-12-16
2080
2080
2051
2080
2080
2080
2078
554
1739
1245
...
0
0
0
0
0
0
2
1739
1739
1739
2010-12-17
3245
3245
3228
3245
3245
3245
3242
903
2700
1900
...
0
0
0
0
1
0
1
2701
2701
2701
2010-12-20
1790
1790
1785
1790
1790
1790
1789
425
1456
1010
...
0
0
0
0
0
0
0
1456
1456
1456
2010-12-21
1907
1907
1891
1907
1907
1907
1906
487
1559
1133
...
0
0
0
0
0
0
1
1560
1560
1560
2010-12-22
1746
1746
1741
1746
1746
1746
1745
386
1401
1033
...
0
0
0
0
0
0
0
1402
1402
1402
2010-12-23
1496
1496
1492
1496
1496
1496
1496
364
1181
832
...
0
0
0
0
0
0
0
1182
1182
1182
2010-12-24
1336
1336
1334
1336
1336
1336
1334
470
1174
890
...
1
1
1
1
0
0
2
1175
1175
1175
2010-12-27
394
394
394
394
394
394
394
149
339
243
...
0
0
0
0
0
0
0
339
339
339
2010-12-28
969
969
966
969
969
969
969
303
736
514
...
0
0
0
0
0
0
0
736
736
736
2010-12-29
1252
1252
1247
1252
1252
1252
1251
510
1058
738
...
0
0
0
0
0
0
1
1058
1058
1058
2010-12-30
1872
1872
1851
1872
1872
1872
1870
991
1633
1014
...
1
1
1
1
0
0
1
1633
1633
1633
2010-12-31
1475
1475
1473
1475
1475
1475
1475
1120
1398
943
...
0
0
0
0
0
0
0
1398
1398
1398
260 rows × 53 columns
In [33]:
# Well this is the downside of incomplete data!
df_2010.resample("B").count().plot(y='Unique Key')
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d97a5f8>
In [35]:
df.groupby(df.index.hour).count().plot(y='Unique Key')
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aeb39b0>
In [36]:
noise_df = df[df['Complaint Type'].str.contains("Noise")]
noise_df.groupby(noise_df.index.hour).count().plot(y='Unique Key')
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x10dd584a8>
In [37]:
# Combine the two
noise_df = df[df['Complaint Type'].str.contains("Noise")]
ax = df.groupby(df.index.hour).count().plot(y='Unique Key')
noise_df.groupby(noise_df.index.hour).count().plot(y='Unique Key', ax=ax)
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e8587b8>
In [39]:
df['Complaint Type'].value_counts()
Out[39]:
Street Light Condition 30886
Traffic Signal Condition 18492
Street Condition 18024
Snow 13831
Water System 13623
Sewer 10798
Dirty Conditions 8298
Noise 6858
HEATING 6172
General Construction/Plumbing 5940
Sanitation Condition 5023
Building/Use 5015
Missed Collection (All Materials) 4057
Graffiti 3574
HEAT/HOT WATER 3264
Noise - Residential 2753
Derelict Vehicles 2742
Blocked Driveway 2602
Illegal Parking 2596
PLUMBING 1590
Air Quality 1526
Special Enforcement 1237
Elevator 1214
Broken Muni Meter 1196
Other Enforcement 1169
Lead 1123
Noise - Commercial 1088
GENERAL CONSTRUCTION 1057
Plumbing 951
Water Conservation 913
...
Sweeping/Inadequate 2
Discipline and Suspension 2
Ferry Complaint 2
Bike Rack Condition 2
DOF Property - RPIE Issue 2
Complaint 2
Illegal Fireworks 2
Special Natural Area District (SNAD) 2
Literature Request 1
Public Toilet 1
Radioactive Material 1
Poison Ivy 1
Parking Card 1
Unsanitary Animal Facility 1
DOF Parking - Address Update 1
Health 1
Research Questions 1
Sprinkler - Mechanical 1
Bottled Water 1
Hazmat Storage/Use 1
Weatherization 1
Tunnel Condition 1
Transportation Provider Complaint 1
Rangehood 1
Construction 1
Home Delivered Meal - Missed Delivery 1
Animal Facility - No Permit 1
Lifeguard 1
Forensic Engineering 1
Beach/Pool/Sauna Complaint 1
Name: Complaint Type, dtype: int64
In [ ]:
Content source: skkandrach/foundations-homework
Similar notebooks: