In [1]:
datafile = "nyc_311_data_subset-2.csv"
In [2]:
import pandas as pd
import numpy as np
In [3]:
data = pd.read_csv(datafile)
data
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
Unique Key
Created Date
Closed Date
Agency
Incident Zip
Borough
Latitude
Longitude
0
1
10/11/16 11:53
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
1
2
10/11/16 11:36
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
2
3
10/11/16 11:36
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
3
4
10/11/16 12:39
10/11/16 12:39
DSNY
NaN
QUEENS
NaN
NaN
4
5
10/11/16 12:18
10/11/16 12:18
DSNY
NaN
QUEENS
NaN
NaN
5
6
10/11/16 11:36
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
6
7
10/11/16 12:39
10/11/16 12:39
DSNY
NaN
QUEENS
NaN
NaN
7
8
10/11/16 11:44
NaN
DSNY
NaN
QUEENS
NaN
NaN
8
9
10/11/16 12:35
10/11/16 12:35
DSNY
NaN
QUEENS
NaN
NaN
9
10
10/11/16 11:26
5/27/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
10
11
9/1/16 0:33
9/16/16 1:06
DCA
10001
MANHATTAN
40.744790
-73.988834
11
12
9/1/16 20:16
9/10/16 18:08
HPD
11691
QUEENS
40.600554
-73.750704
12
13
9/1/16 12:17
9/7/16 12:00
DSNY
11211
BROOKLYN
40.704925
-73.962007
13
14
9/1/16 12:10
9/10/16 14:23
HPD
10027
MANHATTAN
40.812322
-73.955338
14
15
9/1/16 12:32
9/11/16 2:03
HPD
10452
BRONX
40.839529
-73.922534
15
16
9/1/16 20:16
9/10/16 18:08
HPD
11691
QUEENS
40.600554
-73.750704
16
17
9/1/16 8:35
9/7/16 12:00
DSNY
11428
QUEENS
40.721866
-73.745982
17
18
9/1/16 13:19
9/16/16 14:32
DOT
11101
QUEENS
40.746875
-73.952711
18
19
9/1/16 11:00
9/8/16 12:00
DSNY
10075
MANHATTAN
40.773336
-73.955054
19
20
9/1/16 11:45
9/3/16 12:00
DSNY
11215
BROOKLYN
40.662002
-73.983668
20
21
9/1/16 10:11
9/16/16 11:11
DOT
11101
QUEENS
40.745492
-73.953174
21
22
9/1/16 8:22
9/16/16 7:42
DOT
11210
BROOKLYN
40.631530
-73.954573
22
23
9/1/16 17:31
9/16/16 10:10
DOT
11231
BROOKLYN
40.677629
-73.998190
23
24
9/1/16 8:50
9/16/16 11:12
DOT
11210
BROOKLYN
40.628450
-73.947200
24
25
9/1/16 14:19
9/16/16 14:11
DOT
11101
QUEENS
40.745866
-73.955148
25
26
9/1/16 12:46
9/16/16 11:53
DOT
11217
BROOKLYN
40.687518
-73.986320
26
27
9/1/16 13:33
9/16/16 12:55
DOT
11231
BROOKLYN
40.683742
-73.995233
27
28
9/1/16 13:51
9/11/16 2:05
HPD
10457
BRONX
40.852417
-73.899753
28
29
9/1/16 12:28
9/1/16 12:28
DOT
NaN
BRONX
NaN
NaN
29
30
9/1/16 12:34
9/10/16 15:01
HPD
10033
MANHATTAN
40.846318
-73.932740
...
...
...
...
...
...
...
...
...
971033
971034
9/20/16 13:36
9/20/16 16:14
NYPD
11357
QUEENS
40.789638
-73.814111
971034
971035
9/20/16 21:22
9/21/16 10:21
DOT
11368
QUEENS
40.756870
-73.857055
971035
971036
9/20/16 12:27
9/21/16 0:30
DOT
10461
BRONX
40.838262
-73.836541
971036
971037
9/20/16 14:25
9/26/16 13:57
DOF
NaN
Unspecified
NaN
NaN
971037
971038
9/20/16 13:55
9/22/16 8:10
DEP
11217
BROOKLYN
40.677542
-73.979681
971038
971039
9/20/16 13:25
9/21/16 10:59
DOF
10013
MANHATTAN
NaN
NaN
971039
971040
9/20/16 14:08
9/21/16 14:08
DEP
11238
BROOKLYN
40.673232
-73.965010
971040
971041
9/20/16 7:51
9/21/16 11:46
DOT
11222
BROOKLYN
40.725309
-73.957521
971041
971042
9/20/16 19:57
9/20/16 19:58
HRA
NaN
Unspecified
NaN
NaN
971042
971043
9/20/16 8:56
9/20/16 10:20
DEP
11209
BROOKLYN
40.632837
-74.027230
971043
971044
9/20/16 6:22
9/23/16 12:45
DEP
10029
MANHATTAN
40.787172
-73.949982
971044
971045
9/20/16 3:40
NaN
DEP
11366
QUEENS
40.720328
-73.807761
971045
971046
9/20/16 0:06
9/22/16 22:45
DEP
10027
MANHATTAN
40.808013
-73.950129
971046
971047
9/20/16 3:05
9/20/16 7:58
NYPD
10030
MANHATTAN
40.816358
-73.940220
971047
971048
9/20/16 9:19
9/27/16 15:20
DOT
NaN
BROOKLYN
NaN
NaN
971048
971049
9/20/16 20:42
1/13/17 8:56
DOT
10314
STATEN ISLAND
40.611248
-74.169591
971049
971050
9/20/16 16:42
9/20/16 18:40
DOT
NaN
QUEENS
NaN
NaN
971050
971051
9/20/16 11:47
9/16/16 11:47
DOT
11414
QUEENS
40.648070
-73.844823
971051
971052
9/20/16 18:31
9/20/16 20:42
NYPD
11205
BROOKLYN
40.694865
-73.979214
971052
971053
9/20/16 19:35
9/20/16 21:39
NYPD
11205
BROOKLYN
40.694865
-73.979214
971053
971054
9/20/16 8:10
9/23/16 11:28
DOT
NaN
QUEENS
NaN
NaN
971054
971055
9/20/16 10:19
9/21/16 5:09
DHS
10025
MANHATTAN
40.796495
-73.970431
971055
971056
9/20/16 17:01
9/20/16 21:34
NYPD
10024
MANHATTAN
40.788312
-73.976502
971056
971057
9/20/16 4:11
9/20/16 11:30
DEP
10014
MANHATTAN
40.734051
-74.004247
971057
971058
9/20/16 9:42
9/20/16 11:11
DOF
10024
MANHATTAN
NaN
NaN
971058
971059
9/20/16 15:02
9/24/16 9:33
HPD
10469
BRONX
40.877527
-73.847712
971059
971060
9/20/16 11:36
10/21/16 20:02
HPD
10029
MANHATTAN
40.791709
-73.937064
971060
971061
9/20/16 16:41
9/24/16 9:41
HPD
10460
BRONX
40.833706
-73.895943
971061
971062
9/20/16 11:36
10/21/16 20:02
HPD
10029
MANHATTAN
40.791709
-73.937064
971062
971063
9/20/16 13:41
10/11/16 2:04
HPD
11432
QUEENS
40.711279
-73.788951
971063 rows × 8 columns
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 971063 entries, 0 to 971062
Data columns (total 8 columns):
Unique Key 971063 non-null int64
Created Date 971063 non-null object
Closed Date 882944 non-null object
Agency 971063 non-null object
Incident Zip 911140 non-null object
Borough 971063 non-null object
Latitude 887284 non-null float64
Longitude 887284 non-null float64
dtypes: float64(2), int64(1), object(5)
memory usage: 59.3+ MB
In [5]:
data = pd.read_csv(datafile,index_col='Unique Key')
/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
In [6]:
data.iloc[1:10]
Out[6]:
Created Date
Closed Date
Agency
Incident Zip
Borough
Latitude
Longitude
Unique Key
2
10/11/16 11:36
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
3
10/11/16 11:36
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
4
10/11/16 12:39
10/11/16 12:39
DSNY
NaN
QUEENS
NaN
NaN
5
10/11/16 12:18
10/11/16 12:18
DSNY
NaN
QUEENS
NaN
NaN
6
10/11/16 11:36
10/11/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
7
10/11/16 12:39
10/11/16 12:39
DSNY
NaN
QUEENS
NaN
NaN
8
10/11/16 11:44
NaN
DSNY
NaN
QUEENS
NaN
NaN
9
10/11/16 12:35
10/11/16 12:35
DSNY
NaN
QUEENS
NaN
NaN
10
10/11/16 11:26
5/27/16 12:00
DSNY
NaN
QUEENS
NaN
NaN
In [7]:
data['Incident Zip'].unique()
Out[7]:
array([nan, '10001', '11691', '11211', '10027', '10452', '11428', '11101',
'10075', '11215', '11210', '11231', '11217', '10457', '10033',
'11209', '11201', '11367', '10029', '10021', '10028', '10034',
'10032', '10039', '11414', '10461', '11229', '10462', '11223',
'10023', '10453', '11225', '11219', '10451', '11234', '10014',
'11354', '11361', '10468', '11233', '10466', '11204', '11413',
'11224', '11375', '11040', '11232', '11203', '11205', '11434',
'10011', '10003', '10025', '10013', '10036', '11237', '11355',
'11368', '10454', '10456', '10463', '11222', '11228', '11216',
'10128', '11435', '11419', '11358', '11421', '10019', '11238',
'11213', '11235', '11420', '10038', '11226', '10472', '10016',
'11221', '11236', '11436', '11214', '11377', '11385', '11365',
'10312', '11426', '11373', '11218', '10005', '11230', '10026',
'10473', '10280', '10301', '10309', '10310', '10009', '10002',
'11433', '10020', '11357', '10030', '11378', '11249', '11432',
'11212', '10024', '10035', '11429', '11206', '11372', '10471',
'10119', '10307', '11364', '11103', '10017', '10012', '11105',
'10458', '10018', '11374', '10459', '10314', '10037', '10302',
'10040', '11411', '11692', '10303', '11418', '10031', '11220',
'11427', '10465', '10306', '10010', '10460', '10305', '11207',
'11208', '10474', '11417', '10475', '10455', '11416', '10065',
'11363', '11693', '10308', '11356', '10469', '11369', '10470',
'10467', '10007', '10304', '11366', '11694', '11102', '11423',
'11422', '19044', '11412', '10022', '11379', '11251', '30374-0283',
'11004', '11104', '10004', '11362', '11360', '11109', '7047',
'11590', '11001', '11430', '11106', '10464', '43216', '11370',
'10271', '11239', '11415', '10006', '10121', '10177', '11030-3409',
'10069', '7302', '0', '10044', '10282', '10000', '11743', '10178',
'7114', '7410', '13851', '10169', '11697', '10168', '10538',
'10123', '11516', '10174', '10278', '11598', '10112', '10605',
'UNKNOWN', '10172', '11570', '7105', '11561', '11576', '10162',
'10281', '10151', '220194', '99999', '83', '44087', '11030',
'11725', '10111', '11005', '11803', '90274', '11520', '7112',
'10041', '10167', '10107', '10120', '7073', '10154', '10165',
'11550', '11701', '33134', '7628', '10532', '10703', '11371',
'10105', '18773', '11580', '11725-9030', '10158', '7722', '11779',
'11359', '10153', '10604', '10279', '29601', '30006', '11792',
'11747', '10118', '11566', '11776', '12344', '10103', '10801',
'11509', '10701', '10913', '11518', '11002', '10573', '10601',
'11581', '11710', '42101', '10803', '11050', '3358', '8876',
'10179', '12222', '11003', '10110', 'UNKOWN', '11636', '98057',
'10550', '7310', '10115', '60179', '7093', '7306', '10170', '61702',
'10804', '11791', '7013', '11042', '7024', '29616-0759', '14094',
'11040-4204', '55433', '33033', '11501', '11514', '11021',
'02241-5789', '91711', '11735', '7104', '48393-0824', '30353-0949',
'10173', '2298', '33433', '33309', '10106', '10045', '7101', '7307',
'1581', '11244', '7086', '?', '11530', 11433.0, 11213.0, 10037.0,
10030.0, 10462.0, 10035.0, 10458.0, 11377.0, 11691.0, 10036.0,
10002.0, 11355.0, 11419.0, 11209.0, 10460.0, 11226.0, 10011.0,
11233.0, 10033.0, 10009.0, 10463.0, 10459.0, 10031.0, 10034.0,
10472.0, 11372.0, 10025.0, 10301.0, 11357.0, 11216.0, 11230.0,
10302.0, 10452.0, 11234.0, 10003.0, 10453.0, 10456.0, 10473.0,
11238.0, 10027.0, 10451.0, 11218.0, 11208.0, 11220.0, 10032.0,
11366.0, 11214.0, 11236.0, 11223.0, 10303.0, 11207.0, 11378.0,
10466.0, 11212.0, 11219.0, 10028.0, 11201.0, 11373.0, 10023.0,
10468.0, 11105.0, 11222.0, 10304.0, 11413.0, 10467.0, 10039.0,
11420.0, 10457.0, 10016.0, 11414.0, 11104.0, 11221.0, 10040.0,
11435.0, 11385.0, 10026.0, 11368.0, 11215.0, 11412.0, 11001.0,
10010.0, 10306.0, 11101.0, 11415.0, 11375.0, 10465.0, 10024.0,
10021.0, 11237.0, 10014.0, 11229.0, 11217.0, 10310.0, 10461.0,
10012.0, 11356.0, 11228.0, 11205.0, 10001.0, 11210.0, 10455.0,
11411.0, 10005.0, 11418.0, 11232.0, 10471.0, 10029.0, 10308.0,
11225.0, 10019.0, 11422.0, 10006.0, 11224.0, 11354.0, 11249.0,
11434.0, 10282.0, 11374.0, 10128.0, 10013.0, 11367.0, 10065.0,
11231.0, 11211.0, 11432.0, 11692.0, 11235.0, 11204.0, 11361.0,
10017.0, 11103.0, 10020.0, 10312.0, 11365.0, 10018.0, 11206.0,
11416.0, 10305.0, 11370.0, 11203.0, 11379.0, 11369.0, 10469.0,
10314.0, 10075.0, 11436.0, 11429.0, 10022.0, 11428.0, 10470.0,
10474.0, 11358.0, 11102.0, 11427.0, 10454.0, 10309.0, 11421.0,
11423.0, 11106.0, 11362.0, 10307.0, 10464.0, 11360.0, 10038.0,
11426.0, 10112.0, 11239.0, 11004.0, 11417.0, 11693.0, 11364.0,
11021.0, 7114.0, 11694.0, 10475.0, 10007.0, 10280.0, 10069.0,
8861.0, 10279.0, 10044.0, 8817.0, 10004.0, 11363.0, 11005.0,
11697.0, 10281.0, 10111.0, 11040.0, 11430.0, 10118.0, 10121.0,
7302.0, 11251.0, 10803.0, 10119.0, 10580.0, 11109.0, 11749.0,
19053.0, 10708.0, 10165.0, 10122.0, 10176.0, 10000.0, 10178.0,
11520.0, 10583.0, 10048.0, 10168.0, 10177.0, 83.0, 8901.0, 10110.0,
10169.0, 7310.0, 8512.0, 97062.0, 10167.0, 11743.0, 11371.0,
80014.0, 11722.0, 10155.0, 11554.0, 10123.0, 6870.0, 10103.0,
12222.0, 10041.0, 10106.0, 7086.0, 1749.0, 11042.0, 11719.0,
11024.0, 11801.0, 10162.0, 7208.0, 97114.0, 11571.0, 10278.0,
19034.0, 11695.0, 11968.0, 11010.0, 0.0, '10707', '10155', '10580',
'11241', '11553', '10176', '11510', '11802-9060', '11757', '19040',
'11758', '11746', '30346', '11695', '11575', '11572', '10960',
'10710', '18901', '42301', '32752-9963', '10705', '11557', '3830',
'10134', '7029', '10055', '10704', '11558', '7960', '12209', '7712',
'11797', '14624', '10048', '14222', '19962', '19399', '2346',
'11756-0215', '11111', '7608', '10530', '07103-4154', '11801',
'10583', '11793', '10166', 'NJ 07114', '6601', '10023-6349',
'10805', '877', '7660', '11563', '9153', '14231-9054', '11729',
'11559', '11431', '7747', '7040', '11579', '10152', '92131',
'19123', '90054-0807', '55486-0102', '37909', '10543', '11706',
'7030', '91942', '14450', '7090', '12550', '12553', '10567',
'10259', '11552', '92193', '1757', '11554', '10989', '11219-0283',
'7070', '37214', '17108-0988', '43218', '2114', '10553',
'43218-2118', '32256', '7640', '7087', 11561.0, 10154.0, 11359.0,
11530.0, 10977.0, 10173.0, 7108.0, 10271.0, 6902.0, 10120.0,
10153.0, 10158.0, 11590.0, 99999.0, 7030.0, 10591.0, 8043.0,
10174.0, 105457.0, 10604.0, 7304.0, 10573.0, '10801-6393', '20705',
'33128', '14228', '11582', '10008', '7094', '11577', '7201',
'10122', '29615', '19047', '2062', '12205', '11704', '11242',
'11797-9001', '10129', 'JFK', '1853', '11768'], dtype=object)
In [ ]:
In [8]:
def fix_zip(input_zip):
try:
input_zip = int(float(input_zip))
except:
try:
input_zip = int(input_zip.split('-')[0])
except:
return np.NaN
if input_zip < 10000 or input_zip > 19999:
return np.NaN
return str(input_zip)
In [9]:
fix_zip('11211.00')
Out[9]:
'11211'
In [10]:
data['Incident Zip'] = data['Incident Zip'].apply(fix_zip)
In [11]:
data['Incident Zip'].unique()
Out[11]:
array([nan, '10001', '11691', '11211', '10027', '10452', '11428', '11101',
'10075', '11215', '11210', '11231', '11217', '10457', '10033',
'11209', '11201', '11367', '10029', '10021', '10028', '10034',
'10032', '10039', '11414', '10461', '11229', '10462', '11223',
'10023', '10453', '11225', '11219', '10451', '11234', '10014',
'11354', '11361', '10468', '11233', '10466', '11204', '11413',
'11224', '11375', '11040', '11232', '11203', '11205', '11434',
'10011', '10003', '10025', '10013', '10036', '11237', '11355',
'11368', '10454', '10456', '10463', '11222', '11228', '11216',
'10128', '11435', '11419', '11358', '11421', '10019', '11238',
'11213', '11235', '11420', '10038', '11226', '10472', '10016',
'11221', '11236', '11436', '11214', '11377', '11385', '11365',
'10312', '11426', '11373', '11218', '10005', '11230', '10026',
'10473', '10280', '10301', '10309', '10310', '10009', '10002',
'11433', '10020', '11357', '10030', '11378', '11249', '11432',
'11212', '10024', '10035', '11429', '11206', '11372', '10471',
'10119', '10307', '11364', '11103', '10017', '10012', '11105',
'10458', '10018', '11374', '10459', '10314', '10037', '10302',
'10040', '11411', '11692', '10303', '11418', '10031', '11220',
'11427', '10465', '10306', '10010', '10460', '10305', '11207',
'11208', '10474', '11417', '10475', '10455', '11416', '10065',
'11363', '11693', '10308', '11356', '10469', '11369', '10470',
'10467', '10007', '10304', '11366', '11694', '11102', '11423',
'11422', '19044', '11412', '10022', '11379', '11251', '11004',
'11104', '10004', '11362', '11360', '11109', '11590', '11001',
'11430', '11106', '10464', '11370', '10271', '11239', '11415',
'10006', '10121', '10177', '11030', '10069', '10044', '10282',
'10000', '11743', '10178', '13851', '10169', '11697', '10168',
'10538', '10123', '11516', '10174', '10278', '11598', '10112',
'10605', '10172', '11570', '11561', '11576', '10162', '10281',
'10151', '11725', '10111', '11005', '11803', '11520', '10041',
'10167', '10107', '10120', '10154', '10165', '11550', '11701',
'10532', '10703', '11371', '10105', '18773', '11580', '10158',
'11779', '11359', '10153', '10604', '10279', '11792', '11747',
'10118', '11566', '11776', '12344', '10103', '10801', '11509',
'10701', '10913', '11518', '11002', '10573', '10601', '11581',
'11710', '10803', '11050', '10179', '12222', '11003', '10110',
'11636', '10550', '10115', '10170', '10804', '11791', '11042',
'14094', '11501', '11514', '11021', '11735', '10173', '10106',
'10045', '11244', '11530', '10580', '11749', '19053', '10708',
'10122', '10176', '10583', '10048', '11722', '10155', '11554',
'11719', '11024', '11801', '11571', '19034', '11695', '11968',
'11010', '10707', '11241', '11553', '11510', '11802', '11757',
'19040', '11758', '11746', '11575', '11572', '10960', '10710',
'18901', '10705', '11557', '10134', '10055', '10704', '11558',
'12209', '11797', '14624', '14222', '19962', '19399', '11756',
'11111', '10530', '11793', '10166', '10805', '11563', '14231',
'11729', '11559', '11431', '11579', '10152', '19123', '10543',
'11706', '14450', '12550', '12553', '10567', '10259', '11552',
'10989', '17108', '10553', '10977', '10591', '14228', '11582',
'10008', '11577', '19047', '12205', '11704', '11242', '10129',
'11768'], dtype=object)
In [12]:
data = data[data['Incident Zip'].notnull()]
In [13]:
data
Out[13]:
Created Date
Closed Date
Agency
Incident Zip
Borough
Latitude
Longitude
Unique Key
11
9/1/16 0:33
9/16/16 1:06
DCA
10001
MANHATTAN
40.744790
-73.988834
12
9/1/16 20:16
9/10/16 18:08
HPD
11691
QUEENS
40.600554
-73.750704
13
9/1/16 12:17
9/7/16 12:00
DSNY
11211
BROOKLYN
40.704925
-73.962007
14
9/1/16 12:10
9/10/16 14:23
HPD
10027
MANHATTAN
40.812322
-73.955338
15
9/1/16 12:32
9/11/16 2:03
HPD
10452
BRONX
40.839529
-73.922534
16
9/1/16 20:16
9/10/16 18:08
HPD
11691
QUEENS
40.600554
-73.750704
17
9/1/16 8:35
9/7/16 12:00
DSNY
11428
QUEENS
40.721866
-73.745982
18
9/1/16 13:19
9/16/16 14:32
DOT
11101
QUEENS
40.746875
-73.952711
19
9/1/16 11:00
9/8/16 12:00
DSNY
10075
MANHATTAN
40.773336
-73.955054
20
9/1/16 11:45
9/3/16 12:00
DSNY
11215
BROOKLYN
40.662002
-73.983668
21
9/1/16 10:11
9/16/16 11:11
DOT
11101
QUEENS
40.745492
-73.953174
22
9/1/16 8:22
9/16/16 7:42
DOT
11210
BROOKLYN
40.631530
-73.954573
23
9/1/16 17:31
9/16/16 10:10
DOT
11231
BROOKLYN
40.677629
-73.998190
24
9/1/16 8:50
9/16/16 11:12
DOT
11210
BROOKLYN
40.628450
-73.947200
25
9/1/16 14:19
9/16/16 14:11
DOT
11101
QUEENS
40.745866
-73.955148
26
9/1/16 12:46
9/16/16 11:53
DOT
11217
BROOKLYN
40.687518
-73.986320
27
9/1/16 13:33
9/16/16 12:55
DOT
11231
BROOKLYN
40.683742
-73.995233
28
9/1/16 13:51
9/11/16 2:05
HPD
10457
BRONX
40.852417
-73.899753
30
9/1/16 12:34
9/10/16 15:01
HPD
10033
MANHATTAN
40.846318
-73.932740
31
9/1/16 7:03
9/7/16 7:30
DEP
11209
BROOKLYN
40.636674
-74.036517
32
9/1/16 12:29
9/11/16 2:03
HPD
10452
BRONX
40.839529
-73.922534
33
9/1/16 17:32
9/16/16 7:20
DOT
11201
BROOKLYN
40.694792
-73.994299
34
9/1/16 8:53
9/16/16 9:03
DOT
11201
BROOKLYN
40.687137
-73.990174
35
9/1/16 17:36
9/16/16 9:47
DOT
11210
BROOKLYN
40.631101
-73.952534
36
9/1/16 17:46
9/16/16 9:49
DOT
11367
QUEENS
40.729686
-73.815145
37
9/1/16 13:12
9/2/16 12:00
DSNY
10029
MANHATTAN
40.787104
-73.945002
38
9/1/16 9:05
9/8/16 12:00
DSNY
10021
MANHATTAN
40.768324
-73.955137
39
9/1/16 12:00
9/16/16 11:48
DOT
10028
MANHATTAN
40.774136
-73.951363
40
9/1/16 12:19
9/16/16 7:20
DOT
10034
MANHATTAN
40.867919
-73.920454
41
9/1/16 9:57
9/16/16 9:01
DOT
11217
BROOKLYN
40.687773
-73.987001
...
...
...
...
...
...
...
...
971027
9/20/16 10:35
10/6/16 11:41
DOF
10019
MANHATTAN
NaN
NaN
971028
9/20/16 9:14
9/20/16 12:53
DOF
11201
BROOKLYN
NaN
NaN
971030
9/20/16 16:27
9/21/16 9:38
DOF
10466
BRONX
NaN
NaN
971031
9/20/16 16:13
9/21/16 11:00
DOT
10462
BRONX
40.835874
-73.848426
971032
9/20/16 13:08
9/28/16 21:55
DOT
10468
BRONX
40.863033
-73.909054
971034
9/20/16 13:36
9/20/16 16:14
NYPD
11357
QUEENS
40.789638
-73.814111
971035
9/20/16 21:22
9/21/16 10:21
DOT
11368
QUEENS
40.756870
-73.857055
971036
9/20/16 12:27
9/21/16 0:30
DOT
10461
BRONX
40.838262
-73.836541
971038
9/20/16 13:55
9/22/16 8:10
DEP
11217
BROOKLYN
40.677542
-73.979681
971039
9/20/16 13:25
9/21/16 10:59
DOF
10013
MANHATTAN
NaN
NaN
971040
9/20/16 14:08
9/21/16 14:08
DEP
11238
BROOKLYN
40.673232
-73.965010
971041
9/20/16 7:51
9/21/16 11:46
DOT
11222
BROOKLYN
40.725309
-73.957521
971043
9/20/16 8:56
9/20/16 10:20
DEP
11209
BROOKLYN
40.632837
-74.027230
971044
9/20/16 6:22
9/23/16 12:45
DEP
10029
MANHATTAN
40.787172
-73.949982
971045
9/20/16 3:40
NaN
DEP
11366
QUEENS
40.720328
-73.807761
971046
9/20/16 0:06
9/22/16 22:45
DEP
10027
MANHATTAN
40.808013
-73.950129
971047
9/20/16 3:05
9/20/16 7:58
NYPD
10030
MANHATTAN
40.816358
-73.940220
971049
9/20/16 20:42
1/13/17 8:56
DOT
10314
STATEN ISLAND
40.611248
-74.169591
971051
9/20/16 11:47
9/16/16 11:47
DOT
11414
QUEENS
40.648070
-73.844823
971052
9/20/16 18:31
9/20/16 20:42
NYPD
11205
BROOKLYN
40.694865
-73.979214
971053
9/20/16 19:35
9/20/16 21:39
NYPD
11205
BROOKLYN
40.694865
-73.979214
971055
9/20/16 10:19
9/21/16 5:09
DHS
10025
MANHATTAN
40.796495
-73.970431
971056
9/20/16 17:01
9/20/16 21:34
NYPD
10024
MANHATTAN
40.788312
-73.976502
971057
9/20/16 4:11
9/20/16 11:30
DEP
10014
MANHATTAN
40.734051
-74.004247
971058
9/20/16 9:42
9/20/16 11:11
DOF
10024
MANHATTAN
NaN
NaN
971059
9/20/16 15:02
9/24/16 9:33
HPD
10469
BRONX
40.877527
-73.847712
971060
9/20/16 11:36
10/21/16 20:02
HPD
10029
MANHATTAN
40.791709
-73.937064
971061
9/20/16 16:41
9/24/16 9:41
HPD
10460
BRONX
40.833706
-73.895943
971062
9/20/16 11:36
10/21/16 20:02
HPD
10029
MANHATTAN
40.791709
-73.937064
971063
9/20/16 13:41
10/11/16 2:04
HPD
11432
QUEENS
40.711279
-73.788951
910907 rows × 7 columns
In [14]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 910907 entries, 11 to 971063
Data columns (total 7 columns):
Created Date 910907 non-null object
Closed Date 829453 non-null object
Agency 910907 non-null object
Incident Zip 910907 non-null object
Borough 910907 non-null object
Latitude 887168 non-null float64
Longitude 887168 non-null float64
dtypes: float64(2), object(5)
memory usage: 55.6+ MB
In [15]:
data = data[(data['Latitude'].notnull()) & (data['Longitude'].notnull()) & (data['Closed Date'].notnull())]
In [16]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 806561 entries, 11 to 971063
Data columns (total 7 columns):
Created Date 806561 non-null object
Closed Date 806561 non-null object
Agency 806561 non-null object
Incident Zip 806561 non-null object
Borough 806561 non-null object
Latitude 806561 non-null float64
Longitude 806561 non-null float64
dtypes: float64(2), object(5)
memory usage: 49.2+ MB
In [17]:
data['Borough'].unique()
Out[17]:
array(['MANHATTAN', 'QUEENS', 'BROOKLYN', 'BRONX', 'STATEN ISLAND',
'Unspecified'], dtype=object)
In [18]:
data[data['Borough']=='Unspecified'][['Agency','Incident Zip']]
Out[18]:
Agency
Incident Zip
Unique Key
28854
NYPD
10312
29525
NYPD
11368
29559
NYPD
11422
29607
NYPD
10454
48565
NYPD
11209
48572
NYPD
11226
48657
NYPD
11225
48743
NYPD
11373
48781
NYPD
10467
48940
DPR
10454
49258
NYPD
11204
49264
NYPD
10039
49299
NYPD
11369
49471
NYPD
10454
49700
NYPD
11217
49919
NYPD
11367
50081
NYPD
11217
50542
NYPD
11234
50865
NYPD
10452
51202
NYPD
11364
72977
NYPD
10014
73118
NYPD
11385
73119
NYPD
10453
73181
NYPD
11101
73241
NYPD
11203
73286
NYPD
11432
73323
NYPD
11234
73433
NYPD
10461
73434
NYPD
11210
73499
NYPD
10314
...
...
...
188498
NYPD
10028
188500
NYPD
11217
188505
NYPD
10040
188506
NYPD
10029
188507
NYPD
10014
188521
NYPD
11208
188536
DHS
10023
188542
TLC
10023
188543
DHS
10023
188544
TLC
11230
188545
DHS
10017
188553
DHS
10024
188558
DHS
10016
188597
NYPD
11206
188599
NYPD
11373
188627
NYPD
11417
188628
NYPD
11418
188633
NYPD
10023
188634
NYPD
10472
188636
NYPD
11236
188639
NYPD
11366
188640
NYPD
10001
188642
NYPD
11237
188647
NYPD
10312
188648
NYPD
11223
188649
NYPD
11373
188650
NYPD
10468
188651
NYPD
11218
214971
NYPD
11233
235626
NYPD
11249
819 rows × 2 columns
In [19]:
data[data['Borough']=='Unspecified'].groupby('Agency').count()
Out[19]:
Created Date
Closed Date
Incident Zip
Borough
Latitude
Longitude
Agency
3/1/01
1
1
1
1
1
1
DHS
67
67
67
67
67
67
DOE
1
1
1
1
1
1
DOF
3
3
3
3
3
3
DOT
13
13
13
13
13
13
DPR
2
2
2
2
2
2
FDNY
1
1
1
1
1
1
NYPD
725
725
725
725
725
725
TLC
6
6
6
6
6
6
In [20]:
nypd_complaints_total = data[data['Agency']=='NYPD']['Borough'].count()
nypd_unspecified = data[(data['Borough']=='Unspecified') & (data['Agency']=="NYPD")]['Borough'].count()
percentage = nypd_unspecified/nypd_complaints_total*100
print("%1.2f"%percentage)
0.26
In [21]:
data = data[data['Borough'] != 'Unspecified']
In [22]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 805742 entries, 11 to 971063
Data columns (total 7 columns):
Created Date 805742 non-null object
Closed Date 805742 non-null object
Agency 805742 non-null object
Incident Zip 805742 non-null object
Borough 805742 non-null object
Latitude 805742 non-null float64
Longitude 805742 non-null float64
dtypes: float64(2), object(5)
memory usage: 49.2+ MB
In [23]:
data['Created Date']
Out[23]:
Unique Key
11 9/1/16 0:33
12 9/1/16 20:16
13 9/1/16 12:17
14 9/1/16 12:10
15 9/1/16 12:32
16 9/1/16 20:16
17 9/1/16 8:35
18 9/1/16 13:19
19 9/1/16 11:00
20 9/1/16 11:45
21 9/1/16 10:11
22 9/1/16 8:22
23 9/1/16 17:31
24 9/1/16 8:50
25 9/1/16 14:19
26 9/1/16 12:46
27 9/1/16 13:33
28 9/1/16 13:51
30 9/1/16 12:34
31 9/1/16 7:03
32 9/1/16 12:29
33 9/1/16 17:32
34 9/1/16 8:53
35 9/1/16 17:36
36 9/1/16 17:46
37 9/1/16 13:12
38 9/1/16 9:05
39 9/1/16 12:00
40 9/1/16 12:19
41 9/1/16 9:57
...
971014 9/20/16 20:01
971018 9/20/16 22:14
971021 9/18/16 11:07
971022 9/20/16 11:02
971023 9/19/16 16:31
971025 9/20/16 19:09
971031 9/20/16 16:13
971032 9/20/16 13:08
971034 9/20/16 13:36
971035 9/20/16 21:22
971036 9/20/16 12:27
971038 9/20/16 13:55
971040 9/20/16 14:08
971041 9/20/16 7:51
971043 9/20/16 8:56
971044 9/20/16 6:22
971046 9/20/16 0:06
971047 9/20/16 3:05
971049 9/20/16 20:42
971051 9/20/16 11:47
971052 9/20/16 18:31
971053 9/20/16 19:35
971055 9/20/16 10:19
971056 9/20/16 17:01
971057 9/20/16 4:11
971059 9/20/16 15:02
971060 9/20/16 11:36
971061 9/20/16 16:41
971062 9/20/16 11:36
971063 9/20/16 13:41
Name: Created Date, Length: 805742, dtype: object
In [24]:
import datetime
#data['Created Date'] = data['Created Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
data['Created Date'] = data['Created Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%y %H:%M'))
In [25]:
data['Created Date'][0:20]
Out[25]:
Unique Key
11 2016-09-01 00:33:00
12 2016-09-01 20:16:00
13 2016-09-01 12:17:00
14 2016-09-01 12:10:00
15 2016-09-01 12:32:00
16 2016-09-01 20:16:00
17 2016-09-01 08:35:00
18 2016-09-01 13:19:00
19 2016-09-01 11:00:00
20 2016-09-01 11:45:00
21 2016-09-01 10:11:00
22 2016-09-01 08:22:00
23 2016-09-01 17:31:00
24 2016-09-01 08:50:00
25 2016-09-01 14:19:00
26 2016-09-01 12:46:00
27 2016-09-01 13:33:00
28 2016-09-01 13:51:00
30 2016-09-01 12:34:00
31 2016-09-01 07:03:00
Name: Created Date, dtype: datetime64[ns]
In [26]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 805742 entries, 11 to 971063
Data columns (total 7 columns):
Created Date 805742 non-null datetime64[ns]
Closed Date 805742 non-null object
Agency 805742 non-null object
Incident Zip 805742 non-null object
Borough 805742 non-null object
Latitude 805742 non-null float64
Longitude 805742 non-null float64
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 69.2+ MB
In [27]:
data['Closed Date'] = data['Closed Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%y %H:%M'))
In [28]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 805742 entries, 11 to 971063
Data columns (total 7 columns):
Created Date 805742 non-null datetime64[ns]
Closed Date 805742 non-null datetime64[ns]
Agency 805742 non-null object
Incident Zip 805742 non-null object
Borough 805742 non-null object
Latitude 805742 non-null float64
Longitude 805742 non-null float64
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 69.2+ MB
In [29]:
# pandas 需要再0.20.3才可以. 0.19.x 是会报错.
data['processing_time'] = data['Closed Date'] - data['Created Date']
In [30]:
#And look at summary statistics
data['processing_time'].describe()
Out[30]:
count 805742
mean 5 days 00:05:12.826909
std 12 days 06:08:18.124834
min -134 days +00:00:00
25% 0 days 02:35:00
50% 0 days 21:11:00
75% 4 days 14:30:00
max 148 days 13:11:00
Name: processing_time, dtype: object
In [31]:
data[data['processing_time']<datetime.timedelta(0,0,0)]
Out[31]:
Created Date
Closed Date
Agency
Incident Zip
Borough
Latitude
Longitude
processing_time
Unique Key
3972
2016-09-16 14:24:00
2016-09-15 14:23:00
DOT
10314
STATEN ISLAND
40.597868
-74.140537
-2 days +23:59:00
5223
2016-09-20 14:03:00
2016-09-16 14:03:00
DOT
11220
BROOKLYN
40.630682
-74.010970
-4 days +00:00:00
6084
2016-10-20 11:24:00
2016-10-19 01:24:00
DOT
11412
QUEENS
40.696186
-73.751966
-2 days +14:00:00
6085
2016-10-20 16:40:00
2016-10-19 16:39:00
DOT
10306
STATEN ISLAND
40.580343
-74.103262
-2 days +23:59:00
6086
2016-10-20 12:19:00
2016-10-19 12:18:00
DOT
11209
BROOKLYN
40.634865
-74.026381
-2 days +23:59:00
6115
2016-10-20 12:05:00
2016-10-19 02:05:00
DOT
11208
BROOKLYN
40.681095
-73.873586
-2 days +14:00:00
6396
2016-10-24 10:37:00
2016-10-21 10:37:00
DOT
11691
QUEENS
40.608713
-73.747670
-3 days +00:00:00
6682
2016-10-31 10:26:00
2016-10-28 10:26:00
DOT
11417
QUEENS
40.676871
-73.840344
-3 days +00:00:00
6694
2016-10-31 10:46:00
2016-10-27 10:46:00
DOT
11362
QUEENS
40.765202
-73.738088
-4 days +00:00:00
6819
2016-09-20 11:49:00
2016-09-16 11:49:00
DOT
11432
QUEENS
40.703220
-73.802559
-4 days +00:00:00
6820
2016-09-20 14:16:00
2016-09-16 14:16:00
DOT
11238
BROOKLYN
40.680797
-73.958397
-4 days +00:00:00
6830
2016-09-20 14:34:00
2016-09-16 14:34:00
DOT
11201
BROOKLYN
40.692277
-73.988187
-4 days +00:00:00
6836
2016-09-20 14:04:00
2016-09-16 14:00:00
DOT
11220
BROOKLYN
40.649360
-74.012833
-5 days +23:56:00
6840
2016-09-20 12:00:00
2016-09-16 12:00:00
DOT
11434
QUEENS
40.660911
-73.769906
-4 days +00:00:00
6841
2016-09-20 11:48:00
2016-09-16 11:48:00
DOT
11432
QUEENS
40.703220
-73.802559
-4 days +00:00:00
6842
2016-09-20 11:43:00
2016-09-16 11:43:00
DOT
11416
QUEENS
40.685665
-73.852531
-4 days +00:00:00
6843
2016-09-20 14:23:00
2016-09-16 14:23:00
DOT
11221
BROOKLYN
40.695352
-73.916314
-4 days +00:00:00
6845
2016-09-20 11:48:00
2016-09-16 11:48:00
DOT
11432
QUEENS
40.704671
-73.805649
-4 days +00:00:00
6866
2016-09-20 14:03:00
2016-09-16 14:03:00
DOT
11219
BROOKLYN
40.635497
-74.001956
-4 days +00:00:00
6867
2016-09-20 11:59:00
2016-09-16 11:59:00
DOT
11433
QUEENS
40.696159
-73.786607
-4 days +00:00:00
6868
2016-09-20 11:54:00
2016-09-16 11:54:00
DOT
11374
QUEENS
40.728658
-73.863260
-4 days +00:00:00
6869
2016-09-20 12:53:00
2016-09-16 12:53:00
DOT
11224
BROOKLYN
40.576982
-73.981565
-4 days +00:00:00
6877
2016-09-20 14:51:00
2016-09-19 14:50:00
DOT
11225
BROOKLYN
40.663183
-73.962449
-2 days +23:59:00
6885
2016-09-20 12:00:00
2016-09-16 12:00:00
DOT
11412
QUEENS
40.707737
-73.763191
-4 days +00:00:00
6890
2016-09-20 11:49:00
2016-09-16 11:49:00
DOT
11432
QUEENS
40.705233
-73.803573
-4 days +00:00:00
6891
2016-09-20 11:41:00
2016-09-16 11:41:00
DOT
11105
QUEENS
40.773510
-73.899912
-4 days +00:00:00
6893
2016-09-20 12:50:00
2016-09-16 12:50:00
DOT
11212
BROOKLYN
40.668709
-73.917307
-4 days +00:00:00
6895
2016-09-20 14:32:00
2016-09-16 14:32:00
DOT
11217
BROOKLYN
40.681505
-73.974228
-4 days +00:00:00
6896
2016-09-20 12:54:00
2016-09-16 12:54:00
DOT
11214
BROOKLYN
40.588321
-73.985220
-4 days +00:00:00
6899
2016-09-20 12:31:00
2016-09-17 22:40:00
DOT
10469
BRONX
40.858725
-73.833436
-3 days +10:09:00
...
...
...
...
...
...
...
...
...
957010
2016-10-17 00:00:00
2016-09-12 00:00:00
DOHMH
10025
MANHATTAN
40.798563
-73.963405
-35 days +00:00:00
957015
2016-10-17 00:00:00
2016-09-13 00:00:00
DOHMH
10037
MANHATTAN
40.808726
-73.937893
-34 days +00:00:00
957080
2016-10-17 00:00:00
2016-10-11 00:00:00
DOHMH
10461
BRONX
40.847411
-73.849288
-6 days +00:00:00
957095
2016-10-17 00:00:00
2016-10-03 00:00:00
DOHMH
10026
MANHATTAN
40.804663
-73.951919
-14 days +00:00:00
957167
2016-10-17 00:00:00
2016-10-03 00:00:00
DOHMH
11385
QUEENS
40.706613
-73.868167
-14 days +00:00:00
957189
2016-10-17 00:00:00
2016-10-05 00:00:00
DOHMH
10027
MANHATTAN
40.808263
-73.956266
-12 days +00:00:00
958744
2016-09-19 16:54:00
2016-09-17 00:26:00
DOT
10303
STATEN ISLAND
40.631594
-74.160388
-3 days +07:32:00
958745
2016-09-19 12:18:00
2016-09-17 00:15:00
DOT
10465
BRONX
40.814894
-73.803528
-3 days +11:57:00
958746
2016-09-19 12:05:00
2016-09-16 22:52:00
DOT
10460
BRONX
40.834040
-73.895271
-3 days +10:47:00
958747
2016-09-19 16:53:00
2016-09-16 22:57:00
DOT
10306
STATEN ISLAND
40.583297
-74.099911
-3 days +06:04:00
958748
2016-09-19 11:52:00
2016-09-16 11:52:00
DOT
10474
BRONX
40.810851
-73.881858
-3 days +00:00:00
958752
2016-09-20 12:01:00
2016-09-16 12:01:00
DOT
11434
QUEENS
40.679960
-73.782481
-4 days +00:00:00
958753
2016-09-20 14:12:00
2016-09-16 14:12:00
DOT
11226
BROOKLYN
40.650388
-73.958708
-4 days +00:00:00
958758
2016-09-20 11:53:00
2016-09-16 11:53:00
DOT
11694
QUEENS
40.572042
-73.857548
-4 days +00:00:00
958759
2016-09-20 14:33:00
2016-09-16 14:33:00
DOT
11201
BROOKLYN
40.691194
-73.997768
-4 days +00:00:00
958760
2016-09-20 14:31:00
2016-09-16 14:31:00
DOT
11215
BROOKLYN
40.666159
-73.995307
-4 days +00:00:00
958804
2016-09-22 12:15:00
2016-09-21 12:14:00
DOT
11233
BROOKLYN
40.682427
-73.937931
-2 days +23:59:00
958858
2016-09-26 16:23:00
2016-09-23 23:00:00
DOT
10308
STATEN ISLAND
40.545127
-74.139858
-3 days +06:37:00
958859
2016-09-26 13:31:00
2016-09-23 21:20:00
DOT
10474
BRONX
40.809629
-73.890035
-3 days +07:49:00
958860
2016-09-26 12:39:00
2016-09-23 12:39:00
DOT
11232
BROOKLYN
40.653856
-74.008167
-3 days +00:00:00
958861
2016-09-26 12:28:00
2016-09-23 12:28:00
DOT
11211
BROOKLYN
40.705900
-73.963099
-3 days +00:00:00
965521
2016-09-13 12:32:00
2016-09-09 12:32:00
DOT
11691
QUEENS
40.600204
-73.745044
-4 days +00:00:00
965781
2016-09-13 11:51:00
2016-09-08 11:51:00
DOT
11416
QUEENS
40.689714
-73.840767
-5 days +00:00:00
965843
2016-09-13 12:03:00
2016-09-08 12:03:00
DOT
11691
QUEENS
40.595140
-73.753987
-5 days +00:00:00
967705
2016-09-15 16:52:00
2016-09-14 01:20:00
DOT
10309
STATEN ISLAND
40.521280
-74.239458
-2 days +08:28:00
968181
2016-09-15 13:59:00
2016-09-14 13:58:00
DOT
10473
BRONX
40.819163
-73.879813
-2 days +23:59:00
969890
2016-09-08 12:33:00
2016-09-06 12:33:00
DOT
11219
BROOKLYN
40.641815
-73.995813
-2 days +00:00:00
970636
2016-09-12 12:24:00
2016-09-09 22:28:00
DOT
10314
STATEN ISLAND
40.598331
-74.127146
-3 days +10:04:00
971003
2016-09-20 14:19:00
2016-09-16 14:19:00
DOT
11207
BROOKLYN
40.690455
-73.911748
-4 days +00:00:00
971051
2016-09-20 11:47:00
2016-09-16 11:47:00
DOT
11414
QUEENS
40.648070
-73.844823
-4 days +00:00:00
6417 rows × 8 columns
In [34]:
data[data['processing_time']>datetime.timedelta(140,0,0)]
Out[34]:
Created Date
Closed Date
Agency
Incident Zip
Borough
Latitude
Longitude
processing_time
Unique Key
5371
2016-09-01 12:00:00
2017-01-24 00:00:00
DOB
11105
QUEENS
40.770843
-73.904511
144 days 12:00:00
28919
2016-09-02 21:11:00
2017-01-27 00:00:00
DOB
11385
QUEENS
40.706241
-73.860424
146 days 02:49:00
155601
2016-09-01 19:53:00
2017-01-27 11:59:00
DPR
11417
QUEENS
40.677149
-73.860489
147 days 16:06:00
155747
2016-09-01 19:24:00
2017-01-24 00:00:00
DOB
11385
QUEENS
40.702376
-73.887810
144 days 04:36:00
155818
2016-09-01 10:49:00
2017-01-28 00:00:00
DOB
11691
QUEENS
40.597741
-73.775975
148 days 13:11:00
156081
2016-09-01 19:34:00
2017-01-25 08:04:00
DPR
11210
BROOKLYN
40.629988
-73.948792
145 days 12:30:00
156134
2016-09-01 15:57:00
2017-01-25 08:05:00
DPR
11234
BROOKLYN
40.617043
-73.919579
145 days 16:08:00
156523
2016-09-01 09:04:00
2017-01-27 14:12:00
DOT
11357
QUEENS
40.791344
-73.827361
148 days 05:08:00
157320
2016-09-01 12:49:00
2017-01-23 12:00:00
DPR
11205
BROOKLYN
40.689642
-73.953477
143 days 23:11:00
158319
2016-09-01 17:07:00
2017-01-23 00:00:00
DOB
11223
BROOKLYN
40.596909
-73.981448
143 days 06:53:00
168301
2016-09-05 14:26:00
2017-01-27 10:26:00
DPR
11354
QUEENS
40.772271
-73.806820
143 days 20:00:00
172151
2016-09-02 11:39:00
2017-01-25 00:00:00
DOB
11356
QUEENS
40.778338
-73.847936
144 days 12:21:00
172709
2016-09-02 11:40:00
2017-01-25 00:00:00
DOB
11354
QUEENS
40.776631
-73.848008
144 days 12:20:00
172761
2016-09-02 15:24:00
2017-01-23 00:00:00
DOB
11235
BROOKLYN
40.581568
-73.958553
142 days 08:36:00
173841
2016-09-02 11:38:00
2017-01-24 10:02:00
DPR
11228
BROOKLYN
40.615314
-74.006804
143 days 22:24:00
174256
2016-09-02 14:07:00
2017-01-28 00:00:00
DOB
11373
QUEENS
40.736084
-73.871790
147 days 09:53:00
194295
2016-09-02 15:33:00
2017-01-25 08:04:00
DPR
11234
BROOKLYN
40.624334
-73.917466
144 days 16:31:00
194296
2016-09-02 15:47:00
2017-01-24 00:00:00
DOB
11356
QUEENS
40.788745
-73.853007
143 days 08:13:00
223853
2016-09-03 16:22:00
2017-01-24 10:01:00
DPR
11223
BROOKLYN
40.602380
-73.982466
142 days 17:39:00
224063
2016-09-03 10:34:00
2017-01-25 00:00:00
DOB
11220
BROOKLYN
40.648797
-74.011917
143 days 13:26:00
279023
2016-09-07 14:05:00
2017-01-26 17:01:00
DPR
11225
BROOKLYN
40.658557
-73.952496
141 days 02:56:00
279371
2016-09-07 14:17:00
2017-01-26 17:01:00
DPR
11225
BROOKLYN
40.658587
-73.952262
141 days 02:44:00
279375
2016-09-07 12:13:00
2017-01-27 15:01:00
DPR
11226
BROOKLYN
40.647595
-73.948241
142 days 02:48:00
290214
2016-09-04 13:06:00
2017-01-27 08:36:00
DOT
11435
QUEENS
40.712113
-73.819291
144 days 19:30:00
299493
2016-09-02 14:08:00
2017-01-23 07:13:00
DOT
11385
QUEENS
40.709934
-73.897872
142 days 17:05:00
299505
2016-09-03 15:38:00
2017-01-26 16:02:00
DPR
10458
BRONX
40.859428
-73.891113
145 days 00:24:00
303524
2016-09-01 04:13:00
2017-01-22 00:00:00
DOB
11413
QUEENS
40.656629
-73.753756
142 days 19:47:00
338097
2016-09-01 13:26:00
2017-01-24 00:00:00
DOB
11379
QUEENS
40.722387
-73.875947
144 days 10:34:00
338103
2016-09-03 10:42:00
2017-01-24 00:00:00
DOB
11220
BROOKLYN
40.648707
-74.011766
142 days 13:18:00
338107
2016-09-07 13:49:00
2017-01-28 14:01:00
DPR
11210
BROOKLYN
40.620744
-73.949580
143 days 00:12:00
370926
2016-09-05 09:22:00
2017-01-27 08:59:00
DPR
11366
QUEENS
40.732338
-73.776576
143 days 23:37:00
431150
2016-09-05 00:46:00
2017-01-23 11:16:00
DOT
11233
BROOKLYN
40.678364
-73.908137
140 days 10:30:00
470712
2016-09-03 14:08:00
2017-01-25 00:00:00
DOB
11354
QUEENS
40.773811
-73.847050
143 days 09:52:00
472657
2016-09-02 08:46:00
2017-01-24 18:00:00
DPR
11238
BROOKLYN
40.681132
-73.967349
144 days 09:14:00
472768
2016-09-02 17:08:00
2017-01-27 08:01:00
DPR
11214
BROOKLYN
40.589295
-73.984219
146 days 14:53:00
473340
2016-09-02 14:35:00
2017-01-27 08:02:00
DPR
11223
BROOKLYN
40.588090
-73.967877
146 days 17:27:00
473358
2016-09-02 17:07:00
2017-01-27 08:02:00
DPR
11214
BROOKLYN
40.589295
-73.984568
146 days 14:55:00
473549
2016-09-02 09:31:00
2017-01-25 08:01:00
DPR
11226
BROOKLYN
40.639232
-73.962131
144 days 22:30:00
477723
2016-09-06 16:16:00
2017-01-28 14:02:00
DPR
11210
BROOKLYN
40.621882
-73.954623
143 days 21:46:00
478807
2016-09-06 11:59:00
2017-01-28 15:01:00
DPR
11103
QUEENS
40.763953
-73.918272
144 days 03:02:00
478939
2016-09-06 17:30:00
2017-01-27 00:00:00
DOB
11235
BROOKLYN
40.585533
-73.938302
142 days 06:30:00
521571
2016-09-07 00:55:00
2017-01-25 08:01:00
DPR
11234
BROOKLYN
40.618109
-73.914095
140 days 07:06:00
522055
2016-09-06 09:42:00
2017-01-27 00:00:00
DOB
11235
BROOKLYN
40.587920
-73.940532
142 days 14:18:00
533600
2016-09-07 19:36:00
2017-01-26 10:16:00
DOT
11412
QUEENS
40.692284
-73.752510
140 days 14:40:00
558640
2016-09-08 13:36:00
2017-01-29 14:00:00
DPR
11370
QUEENS
40.755721
-73.893939
143 days 00:24:00
569755
2016-09-08 09:48:00
2017-01-28 15:01:00
DPR
11204
BROOKLYN
40.630970
-73.983910
142 days 05:13:00
614925
2016-09-06 15:30:00
2017-01-25 16:22:00
DPR
11412
QUEENS
40.701398
-73.758305
141 days 00:52:00
681064
2016-09-01 23:31:00
2017-01-24 14:11:00
DOT
10469
BRONX
40.864486
-73.843389
144 days 14:40:00
711401
2016-09-07 09:57:00
2017-01-27 00:00:00
DOB
11373
QUEENS
40.734269
-73.883574
141 days 14:03:00
836666
2016-09-10 12:25:00
2017-01-29 00:00:00
DOB
11423
QUEENS
40.721838
-73.774479
140 days 11:35:00
In [35]:
data = data[data['processing_time']>=datetime.timedelta(0,0,0)]
In [36]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 799325 entries, 11 to 971063
Data columns (total 8 columns):
Created Date 799325 non-null datetime64[ns]
Closed Date 799325 non-null datetime64[ns]
Agency 799325 non-null object
Incident Zip 799325 non-null object
Borough 799325 non-null object
Latitude 799325 non-null float64
Longitude 799325 non-null float64
processing_time 799325 non-null timedelta64[ns]
dtypes: datetime64[ns](2), float64(2), object(3), timedelta64[ns](1)
memory usage: 54.9+ MB
In [ ]:
def read_311_data(datafile):
import pandas as pd
import numpy as np
#Add the fix_zip function
def fix_zip(input_zip):
try:
input_zip = int(float(input_zip))
except:
try:
input_zip = int(input_zip.split('-')[0])
except:
return np.NaN
if input_zip < 10000 or input_zip > 19999:
return np.NaN
return str(input_zip)
#Read the file
df = pd.read_csv(datafile,index_col='Unique Key')
#fix the zip
df['Incident Zip'] = df['Incident Zip'].apply(fix_zip)
#drop all rows that have any nans in them (note the easier syntax!)
df = df.dropna(how='any')
#get rid of unspecified boroughs
df = df[df['Borough'] != 'Unspecified']
#Convert times to datetime and create a processing time column
import datetime
df['Created Date'] = df['Created Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
df['Closed Date'] = df['Closed Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
df['processing_time'] = df['Closed Date'] - df['Created Date']
#Finally, get rid of negative processing times and return the final data frame
df = df[df['processing_time']>=datetime.timedelta(0,0,0)]
return df
In [ ]:
df = read_311_data('nyc_311_data_subset.csv')
df.info()
In [ ]:
Content source: KECB/learn
Similar notebooks: