In [1]:
datafile = "nyc_311_data_subset-2.csv"

In [2]:
import pandas as pd
import numpy as np

read_csv: A pandas function that reads a comma separated file

read_csv will try to format the data so that it is the correct type and will report any typing problems
It will also look for a header row.
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html


In [3]:
data = pd.read_csv(datafile)
data


/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
Unique Key Created Date Closed Date Agency Incident Zip Borough Latitude Longitude
0 1 10/11/16 11:53 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
1 2 10/11/16 11:36 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
2 3 10/11/16 11:36 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
3 4 10/11/16 12:39 10/11/16 12:39 DSNY NaN QUEENS NaN NaN
4 5 10/11/16 12:18 10/11/16 12:18 DSNY NaN QUEENS NaN NaN
5 6 10/11/16 11:36 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
6 7 10/11/16 12:39 10/11/16 12:39 DSNY NaN QUEENS NaN NaN
7 8 10/11/16 11:44 NaN DSNY NaN QUEENS NaN NaN
8 9 10/11/16 12:35 10/11/16 12:35 DSNY NaN QUEENS NaN NaN
9 10 10/11/16 11:26 5/27/16 12:00 DSNY NaN QUEENS NaN NaN
10 11 9/1/16 0:33 9/16/16 1:06 DCA 10001 MANHATTAN 40.744790 -73.988834
11 12 9/1/16 20:16 9/10/16 18:08 HPD 11691 QUEENS 40.600554 -73.750704
12 13 9/1/16 12:17 9/7/16 12:00 DSNY 11211 BROOKLYN 40.704925 -73.962007
13 14 9/1/16 12:10 9/10/16 14:23 HPD 10027 MANHATTAN 40.812322 -73.955338
14 15 9/1/16 12:32 9/11/16 2:03 HPD 10452 BRONX 40.839529 -73.922534
15 16 9/1/16 20:16 9/10/16 18:08 HPD 11691 QUEENS 40.600554 -73.750704
16 17 9/1/16 8:35 9/7/16 12:00 DSNY 11428 QUEENS 40.721866 -73.745982
17 18 9/1/16 13:19 9/16/16 14:32 DOT 11101 QUEENS 40.746875 -73.952711
18 19 9/1/16 11:00 9/8/16 12:00 DSNY 10075 MANHATTAN 40.773336 -73.955054
19 20 9/1/16 11:45 9/3/16 12:00 DSNY 11215 BROOKLYN 40.662002 -73.983668
20 21 9/1/16 10:11 9/16/16 11:11 DOT 11101 QUEENS 40.745492 -73.953174
21 22 9/1/16 8:22 9/16/16 7:42 DOT 11210 BROOKLYN 40.631530 -73.954573
22 23 9/1/16 17:31 9/16/16 10:10 DOT 11231 BROOKLYN 40.677629 -73.998190
23 24 9/1/16 8:50 9/16/16 11:12 DOT 11210 BROOKLYN 40.628450 -73.947200
24 25 9/1/16 14:19 9/16/16 14:11 DOT 11101 QUEENS 40.745866 -73.955148
25 26 9/1/16 12:46 9/16/16 11:53 DOT 11217 BROOKLYN 40.687518 -73.986320
26 27 9/1/16 13:33 9/16/16 12:55 DOT 11231 BROOKLYN 40.683742 -73.995233
27 28 9/1/16 13:51 9/11/16 2:05 HPD 10457 BRONX 40.852417 -73.899753
28 29 9/1/16 12:28 9/1/16 12:28 DOT NaN BRONX NaN NaN
29 30 9/1/16 12:34 9/10/16 15:01 HPD 10033 MANHATTAN 40.846318 -73.932740
... ... ... ... ... ... ... ... ...
971033 971034 9/20/16 13:36 9/20/16 16:14 NYPD 11357 QUEENS 40.789638 -73.814111
971034 971035 9/20/16 21:22 9/21/16 10:21 DOT 11368 QUEENS 40.756870 -73.857055
971035 971036 9/20/16 12:27 9/21/16 0:30 DOT 10461 BRONX 40.838262 -73.836541
971036 971037 9/20/16 14:25 9/26/16 13:57 DOF NaN Unspecified NaN NaN
971037 971038 9/20/16 13:55 9/22/16 8:10 DEP 11217 BROOKLYN 40.677542 -73.979681
971038 971039 9/20/16 13:25 9/21/16 10:59 DOF 10013 MANHATTAN NaN NaN
971039 971040 9/20/16 14:08 9/21/16 14:08 DEP 11238 BROOKLYN 40.673232 -73.965010
971040 971041 9/20/16 7:51 9/21/16 11:46 DOT 11222 BROOKLYN 40.725309 -73.957521
971041 971042 9/20/16 19:57 9/20/16 19:58 HRA NaN Unspecified NaN NaN
971042 971043 9/20/16 8:56 9/20/16 10:20 DEP 11209 BROOKLYN 40.632837 -74.027230
971043 971044 9/20/16 6:22 9/23/16 12:45 DEP 10029 MANHATTAN 40.787172 -73.949982
971044 971045 9/20/16 3:40 NaN DEP 11366 QUEENS 40.720328 -73.807761
971045 971046 9/20/16 0:06 9/22/16 22:45 DEP 10027 MANHATTAN 40.808013 -73.950129
971046 971047 9/20/16 3:05 9/20/16 7:58 NYPD 10030 MANHATTAN 40.816358 -73.940220
971047 971048 9/20/16 9:19 9/27/16 15:20 DOT NaN BROOKLYN NaN NaN
971048 971049 9/20/16 20:42 1/13/17 8:56 DOT 10314 STATEN ISLAND 40.611248 -74.169591
971049 971050 9/20/16 16:42 9/20/16 18:40 DOT NaN QUEENS NaN NaN
971050 971051 9/20/16 11:47 9/16/16 11:47 DOT 11414 QUEENS 40.648070 -73.844823
971051 971052 9/20/16 18:31 9/20/16 20:42 NYPD 11205 BROOKLYN 40.694865 -73.979214
971052 971053 9/20/16 19:35 9/20/16 21:39 NYPD 11205 BROOKLYN 40.694865 -73.979214
971053 971054 9/20/16 8:10 9/23/16 11:28 DOT NaN QUEENS NaN NaN
971054 971055 9/20/16 10:19 9/21/16 5:09 DHS 10025 MANHATTAN 40.796495 -73.970431
971055 971056 9/20/16 17:01 9/20/16 21:34 NYPD 10024 MANHATTAN 40.788312 -73.976502
971056 971057 9/20/16 4:11 9/20/16 11:30 DEP 10014 MANHATTAN 40.734051 -74.004247
971057 971058 9/20/16 9:42 9/20/16 11:11 DOF 10024 MANHATTAN NaN NaN
971058 971059 9/20/16 15:02 9/24/16 9:33 HPD 10469 BRONX 40.877527 -73.847712
971059 971060 9/20/16 11:36 10/21/16 20:02 HPD 10029 MANHATTAN 40.791709 -73.937064
971060 971061 9/20/16 16:41 9/24/16 9:41 HPD 10460 BRONX 40.833706 -73.895943
971061 971062 9/20/16 11:36 10/21/16 20:02 HPD 10029 MANHATTAN 40.791709 -73.937064
971062 971063 9/20/16 13:41 10/11/16 2:04 HPD 11432 QUEENS 40.711279 -73.788951

971063 rows × 8 columns

Let's examine our data


In [4]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 971063 entries, 0 to 971062
Data columns (total 8 columns):
Unique Key      971063 non-null int64
Created Date    971063 non-null object
Closed Date     882944 non-null object
Agency          971063 non-null object
Incident Zip    911140 non-null object
Borough         971063 non-null object
Latitude        887284 non-null float64
Longitude       887284 non-null float64
dtypes: float64(2), int64(1), object(5)
memory usage: 59.3+ MB

Looks like Unique Key really is a unique key and can serve as an index


In [5]:
data = pd.read_csv(datafile,index_col='Unique Key')


/Users/Beck/anaconda/envs/tflearn/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [6]:
data.iloc[1:10]


Out[6]:
Created Date Closed Date Agency Incident Zip Borough Latitude Longitude
Unique Key
2 10/11/16 11:36 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
3 10/11/16 11:36 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
4 10/11/16 12:39 10/11/16 12:39 DSNY NaN QUEENS NaN NaN
5 10/11/16 12:18 10/11/16 12:18 DSNY NaN QUEENS NaN NaN
6 10/11/16 11:36 10/11/16 12:00 DSNY NaN QUEENS NaN NaN
7 10/11/16 12:39 10/11/16 12:39 DSNY NaN QUEENS NaN NaN
8 10/11/16 11:44 NaN DSNY NaN QUEENS NaN NaN
9 10/11/16 12:35 10/11/16 12:35 DSNY NaN QUEENS NaN NaN
10 10/11/16 11:26 5/27/16 12:00 DSNY NaN QUEENS NaN NaN

Columns 4 has mixed types

Column 4 is incident zip

Let's examine it
The unique() function returns unique values in a column


In [7]:
data['Incident Zip'].unique()


Out[7]:
array([nan, '10001', '11691', '11211', '10027', '10452', '11428', '11101',
       '10075', '11215', '11210', '11231', '11217', '10457', '10033',
       '11209', '11201', '11367', '10029', '10021', '10028', '10034',
       '10032', '10039', '11414', '10461', '11229', '10462', '11223',
       '10023', '10453', '11225', '11219', '10451', '11234', '10014',
       '11354', '11361', '10468', '11233', '10466', '11204', '11413',
       '11224', '11375', '11040', '11232', '11203', '11205', '11434',
       '10011', '10003', '10025', '10013', '10036', '11237', '11355',
       '11368', '10454', '10456', '10463', '11222', '11228', '11216',
       '10128', '11435', '11419', '11358', '11421', '10019', '11238',
       '11213', '11235', '11420', '10038', '11226', '10472', '10016',
       '11221', '11236', '11436', '11214', '11377', '11385', '11365',
       '10312', '11426', '11373', '11218', '10005', '11230', '10026',
       '10473', '10280', '10301', '10309', '10310', '10009', '10002',
       '11433', '10020', '11357', '10030', '11378', '11249', '11432',
       '11212', '10024', '10035', '11429', '11206', '11372', '10471',
       '10119', '10307', '11364', '11103', '10017', '10012', '11105',
       '10458', '10018', '11374', '10459', '10314', '10037', '10302',
       '10040', '11411', '11692', '10303', '11418', '10031', '11220',
       '11427', '10465', '10306', '10010', '10460', '10305', '11207',
       '11208', '10474', '11417', '10475', '10455', '11416', '10065',
       '11363', '11693', '10308', '11356', '10469', '11369', '10470',
       '10467', '10007', '10304', '11366', '11694', '11102', '11423',
       '11422', '19044', '11412', '10022', '11379', '11251', '30374-0283',
       '11004', '11104', '10004', '11362', '11360', '11109', '7047',
       '11590', '11001', '11430', '11106', '10464', '43216', '11370',
       '10271', '11239', '11415', '10006', '10121', '10177', '11030-3409',
       '10069', '7302', '0', '10044', '10282', '10000', '11743', '10178',
       '7114', '7410', '13851', '10169', '11697', '10168', '10538',
       '10123', '11516', '10174', '10278', '11598', '10112', '10605',
       'UNKNOWN', '10172', '11570', '7105', '11561', '11576', '10162',
       '10281', '10151', '220194', '99999', '83', '44087', '11030',
       '11725', '10111', '11005', '11803', '90274', '11520', '7112',
       '10041', '10167', '10107', '10120', '7073', '10154', '10165',
       '11550', '11701', '33134', '7628', '10532', '10703', '11371',
       '10105', '18773', '11580', '11725-9030', '10158', '7722', '11779',
       '11359', '10153', '10604', '10279', '29601', '30006', '11792',
       '11747', '10118', '11566', '11776', '12344', '10103', '10801',
       '11509', '10701', '10913', '11518', '11002', '10573', '10601',
       '11581', '11710', '42101', '10803', '11050', '3358', '8876',
       '10179', '12222', '11003', '10110', 'UNKOWN', '11636', '98057',
       '10550', '7310', '10115', '60179', '7093', '7306', '10170', '61702',
       '10804', '11791', '7013', '11042', '7024', '29616-0759', '14094',
       '11040-4204', '55433', '33033', '11501', '11514', '11021',
       '02241-5789', '91711', '11735', '7104', '48393-0824', '30353-0949',
       '10173', '2298', '33433', '33309', '10106', '10045', '7101', '7307',
       '1581', '11244', '7086', '?', '11530', 11433.0, 11213.0, 10037.0,
       10030.0, 10462.0, 10035.0, 10458.0, 11377.0, 11691.0, 10036.0,
       10002.0, 11355.0, 11419.0, 11209.0, 10460.0, 11226.0, 10011.0,
       11233.0, 10033.0, 10009.0, 10463.0, 10459.0, 10031.0, 10034.0,
       10472.0, 11372.0, 10025.0, 10301.0, 11357.0, 11216.0, 11230.0,
       10302.0, 10452.0, 11234.0, 10003.0, 10453.0, 10456.0, 10473.0,
       11238.0, 10027.0, 10451.0, 11218.0, 11208.0, 11220.0, 10032.0,
       11366.0, 11214.0, 11236.0, 11223.0, 10303.0, 11207.0, 11378.0,
       10466.0, 11212.0, 11219.0, 10028.0, 11201.0, 11373.0, 10023.0,
       10468.0, 11105.0, 11222.0, 10304.0, 11413.0, 10467.0, 10039.0,
       11420.0, 10457.0, 10016.0, 11414.0, 11104.0, 11221.0, 10040.0,
       11435.0, 11385.0, 10026.0, 11368.0, 11215.0, 11412.0, 11001.0,
       10010.0, 10306.0, 11101.0, 11415.0, 11375.0, 10465.0, 10024.0,
       10021.0, 11237.0, 10014.0, 11229.0, 11217.0, 10310.0, 10461.0,
       10012.0, 11356.0, 11228.0, 11205.0, 10001.0, 11210.0, 10455.0,
       11411.0, 10005.0, 11418.0, 11232.0, 10471.0, 10029.0, 10308.0,
       11225.0, 10019.0, 11422.0, 10006.0, 11224.0, 11354.0, 11249.0,
       11434.0, 10282.0, 11374.0, 10128.0, 10013.0, 11367.0, 10065.0,
       11231.0, 11211.0, 11432.0, 11692.0, 11235.0, 11204.0, 11361.0,
       10017.0, 11103.0, 10020.0, 10312.0, 11365.0, 10018.0, 11206.0,
       11416.0, 10305.0, 11370.0, 11203.0, 11379.0, 11369.0, 10469.0,
       10314.0, 10075.0, 11436.0, 11429.0, 10022.0, 11428.0, 10470.0,
       10474.0, 11358.0, 11102.0, 11427.0, 10454.0, 10309.0, 11421.0,
       11423.0, 11106.0, 11362.0, 10307.0, 10464.0, 11360.0, 10038.0,
       11426.0, 10112.0, 11239.0, 11004.0, 11417.0, 11693.0, 11364.0,
       11021.0, 7114.0, 11694.0, 10475.0, 10007.0, 10280.0, 10069.0,
       8861.0, 10279.0, 10044.0, 8817.0, 10004.0, 11363.0, 11005.0,
       11697.0, 10281.0, 10111.0, 11040.0, 11430.0, 10118.0, 10121.0,
       7302.0, 11251.0, 10803.0, 10119.0, 10580.0, 11109.0, 11749.0,
       19053.0, 10708.0, 10165.0, 10122.0, 10176.0, 10000.0, 10178.0,
       11520.0, 10583.0, 10048.0, 10168.0, 10177.0, 83.0, 8901.0, 10110.0,
       10169.0, 7310.0, 8512.0, 97062.0, 10167.0, 11743.0, 11371.0,
       80014.0, 11722.0, 10155.0, 11554.0, 10123.0, 6870.0, 10103.0,
       12222.0, 10041.0, 10106.0, 7086.0, 1749.0, 11042.0, 11719.0,
       11024.0, 11801.0, 10162.0, 7208.0, 97114.0, 11571.0, 10278.0,
       19034.0, 11695.0, 11968.0, 11010.0, 0.0, '10707', '10155', '10580',
       '11241', '11553', '10176', '11510', '11802-9060', '11757', '19040',
       '11758', '11746', '30346', '11695', '11575', '11572', '10960',
       '10710', '18901', '42301', '32752-9963', '10705', '11557', '3830',
       '10134', '7029', '10055', '10704', '11558', '7960', '12209', '7712',
       '11797', '14624', '10048', '14222', '19962', '19399', '2346',
       '11756-0215', '11111', '7608', '10530', '07103-4154', '11801',
       '10583', '11793', '10166', 'NJ 07114', '6601', '10023-6349',
       '10805', '877', '7660', '11563', '9153', '14231-9054', '11729',
       '11559', '11431', '7747', '7040', '11579', '10152', '92131',
       '19123', '90054-0807', '55486-0102', '37909', '10543', '11706',
       '7030', '91942', '14450', '7090', '12550', '12553', '10567',
       '10259', '11552', '92193', '1757', '11554', '10989', '11219-0283',
       '7070', '37214', '17108-0988', '43218', '2114', '10553',
       '43218-2118', '32256', '7640', '7087', 11561.0, 10154.0, 11359.0,
       11530.0, 10977.0, 10173.0, 7108.0, 10271.0, 6902.0, 10120.0,
       10153.0, 10158.0, 11590.0, 99999.0, 7030.0, 10591.0, 8043.0,
       10174.0, 105457.0, 10604.0, 7304.0, 10573.0, '10801-6393', '20705',
       '33128', '14228', '11582', '10008', '7094', '11577', '7201',
       '10122', '29615', '19047', '2062', '12205', '11704', '11242',
       '11797-9001', '10129', 'JFK', '1853', '11768'], dtype=object)

Some issues

  • Sometimes zip is a float, other times it is a str
  • Zipcodes that are represented as floats and start with 0 are missing the first digit
  • Some zipcodes have the 4 digit extension added. Comparison becomes tough
  • What the heck is zip 0?
  • What about the missing (nan) values? The ? (question mark)? "UNKNOWN"?
  • The first step in data cleaning is to:

    Decide what to do with "bad" data ("JFK", "UNKNOWN", etc.). Convert to Nan or delete the record.

    Make sure all data in a column is in the correct format (convert floats to strings, get rid of the 4 digit extension)

    Decide what to do with missing values (NaNs)

    for "Incident Zip"

    we'll drop rows with NaN or bad data

    get rid of the 4 digit extension

    remove zips less than 10000 and greater than 19999

    Let's write a function that fixes zips

    
    
    In [ ]:
    
    
    
    
    In [8]:
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0])
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 19999:
            return np.NaN
        return str(input_zip)
    

    And test it

    
    
    In [9]:
    fix_zip('11211.00')
    
    
    
    
    Out[9]:
    '11211'

    Next, we'll apply this function to every element in input zip to get a revised column

    The pandas function "apply" applies a function to a dataframe column

  • fix_zip will be applied to each element of the Incident Zip column and we replace the existing column with the modified one
  • 
    
    In [10]:
    data['Incident Zip'] = data['Incident Zip'].apply(fix_zip)
    
    
    
    In [11]:
    data['Incident Zip'].unique()
    
    
    
    
    Out[11]:
    array([nan, '10001', '11691', '11211', '10027', '10452', '11428', '11101',
           '10075', '11215', '11210', '11231', '11217', '10457', '10033',
           '11209', '11201', '11367', '10029', '10021', '10028', '10034',
           '10032', '10039', '11414', '10461', '11229', '10462', '11223',
           '10023', '10453', '11225', '11219', '10451', '11234', '10014',
           '11354', '11361', '10468', '11233', '10466', '11204', '11413',
           '11224', '11375', '11040', '11232', '11203', '11205', '11434',
           '10011', '10003', '10025', '10013', '10036', '11237', '11355',
           '11368', '10454', '10456', '10463', '11222', '11228', '11216',
           '10128', '11435', '11419', '11358', '11421', '10019', '11238',
           '11213', '11235', '11420', '10038', '11226', '10472', '10016',
           '11221', '11236', '11436', '11214', '11377', '11385', '11365',
           '10312', '11426', '11373', '11218', '10005', '11230', '10026',
           '10473', '10280', '10301', '10309', '10310', '10009', '10002',
           '11433', '10020', '11357', '10030', '11378', '11249', '11432',
           '11212', '10024', '10035', '11429', '11206', '11372', '10471',
           '10119', '10307', '11364', '11103', '10017', '10012', '11105',
           '10458', '10018', '11374', '10459', '10314', '10037', '10302',
           '10040', '11411', '11692', '10303', '11418', '10031', '11220',
           '11427', '10465', '10306', '10010', '10460', '10305', '11207',
           '11208', '10474', '11417', '10475', '10455', '11416', '10065',
           '11363', '11693', '10308', '11356', '10469', '11369', '10470',
           '10467', '10007', '10304', '11366', '11694', '11102', '11423',
           '11422', '19044', '11412', '10022', '11379', '11251', '11004',
           '11104', '10004', '11362', '11360', '11109', '11590', '11001',
           '11430', '11106', '10464', '11370', '10271', '11239', '11415',
           '10006', '10121', '10177', '11030', '10069', '10044', '10282',
           '10000', '11743', '10178', '13851', '10169', '11697', '10168',
           '10538', '10123', '11516', '10174', '10278', '11598', '10112',
           '10605', '10172', '11570', '11561', '11576', '10162', '10281',
           '10151', '11725', '10111', '11005', '11803', '11520', '10041',
           '10167', '10107', '10120', '10154', '10165', '11550', '11701',
           '10532', '10703', '11371', '10105', '18773', '11580', '10158',
           '11779', '11359', '10153', '10604', '10279', '11792', '11747',
           '10118', '11566', '11776', '12344', '10103', '10801', '11509',
           '10701', '10913', '11518', '11002', '10573', '10601', '11581',
           '11710', '10803', '11050', '10179', '12222', '11003', '10110',
           '11636', '10550', '10115', '10170', '10804', '11791', '11042',
           '14094', '11501', '11514', '11021', '11735', '10173', '10106',
           '10045', '11244', '11530', '10580', '11749', '19053', '10708',
           '10122', '10176', '10583', '10048', '11722', '10155', '11554',
           '11719', '11024', '11801', '11571', '19034', '11695', '11968',
           '11010', '10707', '11241', '11553', '11510', '11802', '11757',
           '19040', '11758', '11746', '11575', '11572', '10960', '10710',
           '18901', '10705', '11557', '10134', '10055', '10704', '11558',
           '12209', '11797', '14624', '14222', '19962', '19399', '11756',
           '11111', '10530', '11793', '10166', '10805', '11563', '14231',
           '11729', '11559', '11431', '11579', '10152', '19123', '10543',
           '11706', '14450', '12550', '12553', '10567', '10259', '11552',
           '10989', '17108', '10553', '10977', '10591', '14228', '11582',
           '10008', '11577', '19047', '12205', '11704', '11242', '10129',
           '11768'], dtype=object)

    Finally, we'll get rid of all rows that have zip == Nan

  • We don't have to, that's just a choice we're making
  • 
    
    In [12]:
    data = data[data['Incident Zip'].notnull()]
    
    
    
    In [13]:
    data
    
    
    
    
    Out[13]:
    Created Date Closed Date Agency Incident Zip Borough Latitude Longitude
    Unique Key
    11 9/1/16 0:33 9/16/16 1:06 DCA 10001 MANHATTAN 40.744790 -73.988834
    12 9/1/16 20:16 9/10/16 18:08 HPD 11691 QUEENS 40.600554 -73.750704
    13 9/1/16 12:17 9/7/16 12:00 DSNY 11211 BROOKLYN 40.704925 -73.962007
    14 9/1/16 12:10 9/10/16 14:23 HPD 10027 MANHATTAN 40.812322 -73.955338
    15 9/1/16 12:32 9/11/16 2:03 HPD 10452 BRONX 40.839529 -73.922534
    16 9/1/16 20:16 9/10/16 18:08 HPD 11691 QUEENS 40.600554 -73.750704
    17 9/1/16 8:35 9/7/16 12:00 DSNY 11428 QUEENS 40.721866 -73.745982
    18 9/1/16 13:19 9/16/16 14:32 DOT 11101 QUEENS 40.746875 -73.952711
    19 9/1/16 11:00 9/8/16 12:00 DSNY 10075 MANHATTAN 40.773336 -73.955054
    20 9/1/16 11:45 9/3/16 12:00 DSNY 11215 BROOKLYN 40.662002 -73.983668
    21 9/1/16 10:11 9/16/16 11:11 DOT 11101 QUEENS 40.745492 -73.953174
    22 9/1/16 8:22 9/16/16 7:42 DOT 11210 BROOKLYN 40.631530 -73.954573
    23 9/1/16 17:31 9/16/16 10:10 DOT 11231 BROOKLYN 40.677629 -73.998190
    24 9/1/16 8:50 9/16/16 11:12 DOT 11210 BROOKLYN 40.628450 -73.947200
    25 9/1/16 14:19 9/16/16 14:11 DOT 11101 QUEENS 40.745866 -73.955148
    26 9/1/16 12:46 9/16/16 11:53 DOT 11217 BROOKLYN 40.687518 -73.986320
    27 9/1/16 13:33 9/16/16 12:55 DOT 11231 BROOKLYN 40.683742 -73.995233
    28 9/1/16 13:51 9/11/16 2:05 HPD 10457 BRONX 40.852417 -73.899753
    30 9/1/16 12:34 9/10/16 15:01 HPD 10033 MANHATTAN 40.846318 -73.932740
    31 9/1/16 7:03 9/7/16 7:30 DEP 11209 BROOKLYN 40.636674 -74.036517
    32 9/1/16 12:29 9/11/16 2:03 HPD 10452 BRONX 40.839529 -73.922534
    33 9/1/16 17:32 9/16/16 7:20 DOT 11201 BROOKLYN 40.694792 -73.994299
    34 9/1/16 8:53 9/16/16 9:03 DOT 11201 BROOKLYN 40.687137 -73.990174
    35 9/1/16 17:36 9/16/16 9:47 DOT 11210 BROOKLYN 40.631101 -73.952534
    36 9/1/16 17:46 9/16/16 9:49 DOT 11367 QUEENS 40.729686 -73.815145
    37 9/1/16 13:12 9/2/16 12:00 DSNY 10029 MANHATTAN 40.787104 -73.945002
    38 9/1/16 9:05 9/8/16 12:00 DSNY 10021 MANHATTAN 40.768324 -73.955137
    39 9/1/16 12:00 9/16/16 11:48 DOT 10028 MANHATTAN 40.774136 -73.951363
    40 9/1/16 12:19 9/16/16 7:20 DOT 10034 MANHATTAN 40.867919 -73.920454
    41 9/1/16 9:57 9/16/16 9:01 DOT 11217 BROOKLYN 40.687773 -73.987001
    ... ... ... ... ... ... ... ...
    971027 9/20/16 10:35 10/6/16 11:41 DOF 10019 MANHATTAN NaN NaN
    971028 9/20/16 9:14 9/20/16 12:53 DOF 11201 BROOKLYN NaN NaN
    971030 9/20/16 16:27 9/21/16 9:38 DOF 10466 BRONX NaN NaN
    971031 9/20/16 16:13 9/21/16 11:00 DOT 10462 BRONX 40.835874 -73.848426
    971032 9/20/16 13:08 9/28/16 21:55 DOT 10468 BRONX 40.863033 -73.909054
    971034 9/20/16 13:36 9/20/16 16:14 NYPD 11357 QUEENS 40.789638 -73.814111
    971035 9/20/16 21:22 9/21/16 10:21 DOT 11368 QUEENS 40.756870 -73.857055
    971036 9/20/16 12:27 9/21/16 0:30 DOT 10461 BRONX 40.838262 -73.836541
    971038 9/20/16 13:55 9/22/16 8:10 DEP 11217 BROOKLYN 40.677542 -73.979681
    971039 9/20/16 13:25 9/21/16 10:59 DOF 10013 MANHATTAN NaN NaN
    971040 9/20/16 14:08 9/21/16 14:08 DEP 11238 BROOKLYN 40.673232 -73.965010
    971041 9/20/16 7:51 9/21/16 11:46 DOT 11222 BROOKLYN 40.725309 -73.957521
    971043 9/20/16 8:56 9/20/16 10:20 DEP 11209 BROOKLYN 40.632837 -74.027230
    971044 9/20/16 6:22 9/23/16 12:45 DEP 10029 MANHATTAN 40.787172 -73.949982
    971045 9/20/16 3:40 NaN DEP 11366 QUEENS 40.720328 -73.807761
    971046 9/20/16 0:06 9/22/16 22:45 DEP 10027 MANHATTAN 40.808013 -73.950129
    971047 9/20/16 3:05 9/20/16 7:58 NYPD 10030 MANHATTAN 40.816358 -73.940220
    971049 9/20/16 20:42 1/13/17 8:56 DOT 10314 STATEN ISLAND 40.611248 -74.169591
    971051 9/20/16 11:47 9/16/16 11:47 DOT 11414 QUEENS 40.648070 -73.844823
    971052 9/20/16 18:31 9/20/16 20:42 NYPD 11205 BROOKLYN 40.694865 -73.979214
    971053 9/20/16 19:35 9/20/16 21:39 NYPD 11205 BROOKLYN 40.694865 -73.979214
    971055 9/20/16 10:19 9/21/16 5:09 DHS 10025 MANHATTAN 40.796495 -73.970431
    971056 9/20/16 17:01 9/20/16 21:34 NYPD 10024 MANHATTAN 40.788312 -73.976502
    971057 9/20/16 4:11 9/20/16 11:30 DEP 10014 MANHATTAN 40.734051 -74.004247
    971058 9/20/16 9:42 9/20/16 11:11 DOF 10024 MANHATTAN NaN NaN
    971059 9/20/16 15:02 9/24/16 9:33 HPD 10469 BRONX 40.877527 -73.847712
    971060 9/20/16 11:36 10/21/16 20:02 HPD 10029 MANHATTAN 40.791709 -73.937064
    971061 9/20/16 16:41 9/24/16 9:41 HPD 10460 BRONX 40.833706 -73.895943
    971062 9/20/16 11:36 10/21/16 20:02 HPD 10029 MANHATTAN 40.791709 -73.937064
    971063 9/20/16 13:41 10/11/16 2:04 HPD 11432 QUEENS 40.711279 -73.788951

    910907 rows × 7 columns

    Let's take a look at the columns again

    
    
    In [14]:
    data.info()
    
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 910907 entries, 11 to 971063
    Data columns (total 7 columns):
    Created Date    910907 non-null object
    Closed Date     829453 non-null object
    Agency          910907 non-null object
    Incident Zip    910907 non-null object
    Borough         910907 non-null object
    Latitude        887168 non-null float64
    Longitude       887168 non-null float64
    dtypes: float64(2), object(5)
    memory usage: 55.6+ MB
    

    Closed Data, Latitude and Longitude all have missing values

    Let's get rid of them

    
    
    In [15]:
    data = data[(data['Latitude'].notnull()) & (data['Longitude'].notnull())  & (data['Closed Date'].notnull())]
    
    
    
    In [16]:
    data.info()
    
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 806561 entries, 11 to 971063
    Data columns (total 7 columns):
    Created Date    806561 non-null object
    Closed Date     806561 non-null object
    Agency          806561 non-null object
    Incident Zip    806561 non-null object
    Borough         806561 non-null object
    Latitude        806561 non-null float64
    Longitude       806561 non-null float64
    dtypes: float64(2), object(5)
    memory usage: 49.2+ MB
    

    Let's take a look at Borough data

    
    
    In [17]:
    data['Borough'].unique()
    
    
    
    
    Out[17]:
    array(['MANHATTAN', 'QUEENS', 'BROOKLYN', 'BRONX', 'STATEN ISLAND',
           'Unspecified'], dtype=object)

    Let's look at 'Unspecified'

    
    
    In [18]:
    data[data['Borough']=='Unspecified'][['Agency','Incident Zip']]
    
    
    
    
    Out[18]:
    Agency Incident Zip
    Unique Key
    28854 NYPD 10312
    29525 NYPD 11368
    29559 NYPD 11422
    29607 NYPD 10454
    48565 NYPD 11209
    48572 NYPD 11226
    48657 NYPD 11225
    48743 NYPD 11373
    48781 NYPD 10467
    48940 DPR 10454
    49258 NYPD 11204
    49264 NYPD 10039
    49299 NYPD 11369
    49471 NYPD 10454
    49700 NYPD 11217
    49919 NYPD 11367
    50081 NYPD 11217
    50542 NYPD 11234
    50865 NYPD 10452
    51202 NYPD 11364
    72977 NYPD 10014
    73118 NYPD 11385
    73119 NYPD 10453
    73181 NYPD 11101
    73241 NYPD 11203
    73286 NYPD 11432
    73323 NYPD 11234
    73433 NYPD 10461
    73434 NYPD 11210
    73499 NYPD 10314
    ... ... ...
    188498 NYPD 10028
    188500 NYPD 11217
    188505 NYPD 10040
    188506 NYPD 10029
    188507 NYPD 10014
    188521 NYPD 11208
    188536 DHS 10023
    188542 TLC 10023
    188543 DHS 10023
    188544 TLC 11230
    188545 DHS 10017
    188553 DHS 10024
    188558 DHS 10016
    188597 NYPD 11206
    188599 NYPD 11373
    188627 NYPD 11417
    188628 NYPD 11418
    188633 NYPD 10023
    188634 NYPD 10472
    188636 NYPD 11236
    188639 NYPD 11366
    188640 NYPD 10001
    188642 NYPD 11237
    188647 NYPD 10312
    188648 NYPD 11223
    188649 NYPD 11373
    188650 NYPD 10468
    188651 NYPD 11218
    214971 NYPD 11233
    235626 NYPD 11249

    819 rows × 2 columns

    Looks like a lot of these are NYPD related

    Let's take a closer look

    
    
    In [19]:
    data[data['Borough']=='Unspecified'].groupby('Agency').count()
    
    
    
    
    Out[19]:
    Created Date Closed Date Incident Zip Borough Latitude Longitude
    Agency
    3/1/01 1 1 1 1 1 1
    DHS 67 67 67 67 67 67
    DOE 1 1 1 1 1 1
    DOF 3 3 3 3 3 3
    DOT 13 13 13 13 13 13
    DPR 2 2 2 2 2 2
    FDNY 1 1 1 1 1 1
    NYPD 725 725 725 725 725 725
    TLC 6 6 6 6 6 6

    Unspecified appears to have a systematic bias toward NYPD

    Though only a small proportion of NYPD complaints (see below)

    We have to decide whether to keep them or lose them!

    
    
    In [20]:
    nypd_complaints_total = data[data['Agency']=='NYPD']['Borough'].count()
    nypd_unspecified = data[(data['Borough']=='Unspecified') & (data['Agency']=="NYPD")]['Borough'].count()
    percentage = nypd_unspecified/nypd_complaints_total*100
    print("%1.2f"%percentage)
    
    
    
    
    0.26
    

    For now, we'll get rid of them. Unspecified will be hard to explain!

    
    
    In [21]:
    data = data[data['Borough'] != 'Unspecified']
    
    
    
    In [22]:
    data.info()
    
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 805742 entries, 11 to 971063
    Data columns (total 7 columns):
    Created Date    805742 non-null object
    Closed Date     805742 non-null object
    Agency          805742 non-null object
    Incident Zip    805742 non-null object
    Borough         805742 non-null object
    Latitude        805742 non-null float64
    Longitude       805742 non-null float64
    dtypes: float64(2), object(5)
    memory usage: 49.2+ MB
    

    Dealing with time

  • Dates and times are best converted to datetime
  • That way they will be useful for analysis because we can compute timedelta objects
  • 
    
    In [23]:
    data['Created Date']
    
    
    
    
    Out[23]:
    Unique Key
    11          9/1/16 0:33
    12         9/1/16 20:16
    13         9/1/16 12:17
    14         9/1/16 12:10
    15         9/1/16 12:32
    16         9/1/16 20:16
    17          9/1/16 8:35
    18         9/1/16 13:19
    19         9/1/16 11:00
    20         9/1/16 11:45
    21         9/1/16 10:11
    22          9/1/16 8:22
    23         9/1/16 17:31
    24          9/1/16 8:50
    25         9/1/16 14:19
    26         9/1/16 12:46
    27         9/1/16 13:33
    28         9/1/16 13:51
    30         9/1/16 12:34
    31          9/1/16 7:03
    32         9/1/16 12:29
    33         9/1/16 17:32
    34          9/1/16 8:53
    35         9/1/16 17:36
    36         9/1/16 17:46
    37         9/1/16 13:12
    38          9/1/16 9:05
    39         9/1/16 12:00
    40         9/1/16 12:19
    41          9/1/16 9:57
                  ...      
    971014    9/20/16 20:01
    971018    9/20/16 22:14
    971021    9/18/16 11:07
    971022    9/20/16 11:02
    971023    9/19/16 16:31
    971025    9/20/16 19:09
    971031    9/20/16 16:13
    971032    9/20/16 13:08
    971034    9/20/16 13:36
    971035    9/20/16 21:22
    971036    9/20/16 12:27
    971038    9/20/16 13:55
    971040    9/20/16 14:08
    971041     9/20/16 7:51
    971043     9/20/16 8:56
    971044     9/20/16 6:22
    971046     9/20/16 0:06
    971047     9/20/16 3:05
    971049    9/20/16 20:42
    971051    9/20/16 11:47
    971052    9/20/16 18:31
    971053    9/20/16 19:35
    971055    9/20/16 10:19
    971056    9/20/16 17:01
    971057     9/20/16 4:11
    971059    9/20/16 15:02
    971060    9/20/16 11:36
    971061    9/20/16 16:41
    971062    9/20/16 11:36
    971063    9/20/16 13:41
    Name: Created Date, Length: 805742, dtype: object
    
    
    In [24]:
    import datetime
    #data['Created Date'] = data['Created Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
    data['Created Date'] = data['Created Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%y %H:%M'))
    
    
    
    In [25]:
    data['Created Date'][0:20]
    
    
    
    
    Out[25]:
    Unique Key
    11   2016-09-01 00:33:00
    12   2016-09-01 20:16:00
    13   2016-09-01 12:17:00
    14   2016-09-01 12:10:00
    15   2016-09-01 12:32:00
    16   2016-09-01 20:16:00
    17   2016-09-01 08:35:00
    18   2016-09-01 13:19:00
    19   2016-09-01 11:00:00
    20   2016-09-01 11:45:00
    21   2016-09-01 10:11:00
    22   2016-09-01 08:22:00
    23   2016-09-01 17:31:00
    24   2016-09-01 08:50:00
    25   2016-09-01 14:19:00
    26   2016-09-01 12:46:00
    27   2016-09-01 13:33:00
    28   2016-09-01 13:51:00
    30   2016-09-01 12:34:00
    31   2016-09-01 07:03:00
    Name: Created Date, dtype: datetime64[ns]
    
    
    In [26]:
    data.info()
    
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 805742 entries, 11 to 971063
    Data columns (total 7 columns):
    Created Date    805742 non-null datetime64[ns]
    Closed Date     805742 non-null object
    Agency          805742 non-null object
    Incident Zip    805742 non-null object
    Borough         805742 non-null object
    Latitude        805742 non-null float64
    Longitude       805742 non-null float64
    dtypes: datetime64[ns](1), float64(2), object(4)
    memory usage: 69.2+ MB
    
    
    
    In [27]:
    data['Closed Date'] = data['Closed Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%y %H:%M'))
    
    
    
    In [28]:
    data.info()
    
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 805742 entries, 11 to 971063
    Data columns (total 7 columns):
    Created Date    805742 non-null datetime64[ns]
    Closed Date     805742 non-null datetime64[ns]
    Agency          805742 non-null object
    Incident Zip    805742 non-null object
    Borough         805742 non-null object
    Latitude        805742 non-null float64
    Longitude       805742 non-null float64
    dtypes: datetime64[ns](2), float64(2), object(3)
    memory usage: 69.2+ MB
    

    We can create a new column that tracks the time it takes to close a complaint

    
    
    In [29]:
    # pandas 需要再0.20.3才可以. 0.19.x 是会报错.
    data['processing_time'] =  data['Closed Date'] - data['Created Date']
    
    
    
    In [30]:
    #And look at summary statistics
    data['processing_time'].describe()
    
    
    
    
    Out[30]:
    count                     805742
    mean      5 days 00:05:12.826909
    std      12 days 06:08:18.124834
    min          -134 days +00:00:00
    25%              0 days 02:35:00
    50%              0 days 21:11:00
    75%              4 days 14:30:00
    max            148 days 13:11:00
    Name: processing_time, dtype: object

    There is some odd stuff here

  • Negative processing time?
  • Since our data is for two months, a max of 148 days worth checking out
  • Let's examine the negative processing time data

    
    
    In [31]:
    data[data['processing_time']<datetime.timedelta(0,0,0)]
    
    
    
    
    Out[31]:
    Created Date Closed Date Agency Incident Zip Borough Latitude Longitude processing_time
    Unique Key
    3972 2016-09-16 14:24:00 2016-09-15 14:23:00 DOT 10314 STATEN ISLAND 40.597868 -74.140537 -2 days +23:59:00
    5223 2016-09-20 14:03:00 2016-09-16 14:03:00 DOT 11220 BROOKLYN 40.630682 -74.010970 -4 days +00:00:00
    6084 2016-10-20 11:24:00 2016-10-19 01:24:00 DOT 11412 QUEENS 40.696186 -73.751966 -2 days +14:00:00
    6085 2016-10-20 16:40:00 2016-10-19 16:39:00 DOT 10306 STATEN ISLAND 40.580343 -74.103262 -2 days +23:59:00
    6086 2016-10-20 12:19:00 2016-10-19 12:18:00 DOT 11209 BROOKLYN 40.634865 -74.026381 -2 days +23:59:00
    6115 2016-10-20 12:05:00 2016-10-19 02:05:00 DOT 11208 BROOKLYN 40.681095 -73.873586 -2 days +14:00:00
    6396 2016-10-24 10:37:00 2016-10-21 10:37:00 DOT 11691 QUEENS 40.608713 -73.747670 -3 days +00:00:00
    6682 2016-10-31 10:26:00 2016-10-28 10:26:00 DOT 11417 QUEENS 40.676871 -73.840344 -3 days +00:00:00
    6694 2016-10-31 10:46:00 2016-10-27 10:46:00 DOT 11362 QUEENS 40.765202 -73.738088 -4 days +00:00:00
    6819 2016-09-20 11:49:00 2016-09-16 11:49:00 DOT 11432 QUEENS 40.703220 -73.802559 -4 days +00:00:00
    6820 2016-09-20 14:16:00 2016-09-16 14:16:00 DOT 11238 BROOKLYN 40.680797 -73.958397 -4 days +00:00:00
    6830 2016-09-20 14:34:00 2016-09-16 14:34:00 DOT 11201 BROOKLYN 40.692277 -73.988187 -4 days +00:00:00
    6836 2016-09-20 14:04:00 2016-09-16 14:00:00 DOT 11220 BROOKLYN 40.649360 -74.012833 -5 days +23:56:00
    6840 2016-09-20 12:00:00 2016-09-16 12:00:00 DOT 11434 QUEENS 40.660911 -73.769906 -4 days +00:00:00
    6841 2016-09-20 11:48:00 2016-09-16 11:48:00 DOT 11432 QUEENS 40.703220 -73.802559 -4 days +00:00:00
    6842 2016-09-20 11:43:00 2016-09-16 11:43:00 DOT 11416 QUEENS 40.685665 -73.852531 -4 days +00:00:00
    6843 2016-09-20 14:23:00 2016-09-16 14:23:00 DOT 11221 BROOKLYN 40.695352 -73.916314 -4 days +00:00:00
    6845 2016-09-20 11:48:00 2016-09-16 11:48:00 DOT 11432 QUEENS 40.704671 -73.805649 -4 days +00:00:00
    6866 2016-09-20 14:03:00 2016-09-16 14:03:00 DOT 11219 BROOKLYN 40.635497 -74.001956 -4 days +00:00:00
    6867 2016-09-20 11:59:00 2016-09-16 11:59:00 DOT 11433 QUEENS 40.696159 -73.786607 -4 days +00:00:00
    6868 2016-09-20 11:54:00 2016-09-16 11:54:00 DOT 11374 QUEENS 40.728658 -73.863260 -4 days +00:00:00
    6869 2016-09-20 12:53:00 2016-09-16 12:53:00 DOT 11224 BROOKLYN 40.576982 -73.981565 -4 days +00:00:00
    6877 2016-09-20 14:51:00 2016-09-19 14:50:00 DOT 11225 BROOKLYN 40.663183 -73.962449 -2 days +23:59:00
    6885 2016-09-20 12:00:00 2016-09-16 12:00:00 DOT 11412 QUEENS 40.707737 -73.763191 -4 days +00:00:00
    6890 2016-09-20 11:49:00 2016-09-16 11:49:00 DOT 11432 QUEENS 40.705233 -73.803573 -4 days +00:00:00
    6891 2016-09-20 11:41:00 2016-09-16 11:41:00 DOT 11105 QUEENS 40.773510 -73.899912 -4 days +00:00:00
    6893 2016-09-20 12:50:00 2016-09-16 12:50:00 DOT 11212 BROOKLYN 40.668709 -73.917307 -4 days +00:00:00
    6895 2016-09-20 14:32:00 2016-09-16 14:32:00 DOT 11217 BROOKLYN 40.681505 -73.974228 -4 days +00:00:00
    6896 2016-09-20 12:54:00 2016-09-16 12:54:00 DOT 11214 BROOKLYN 40.588321 -73.985220 -4 days +00:00:00
    6899 2016-09-20 12:31:00 2016-09-17 22:40:00 DOT 10469 BRONX 40.858725 -73.833436 -3 days +10:09:00
    ... ... ... ... ... ... ... ... ...
    957010 2016-10-17 00:00:00 2016-09-12 00:00:00 DOHMH 10025 MANHATTAN 40.798563 -73.963405 -35 days +00:00:00
    957015 2016-10-17 00:00:00 2016-09-13 00:00:00 DOHMH 10037 MANHATTAN 40.808726 -73.937893 -34 days +00:00:00
    957080 2016-10-17 00:00:00 2016-10-11 00:00:00 DOHMH 10461 BRONX 40.847411 -73.849288 -6 days +00:00:00
    957095 2016-10-17 00:00:00 2016-10-03 00:00:00 DOHMH 10026 MANHATTAN 40.804663 -73.951919 -14 days +00:00:00
    957167 2016-10-17 00:00:00 2016-10-03 00:00:00 DOHMH 11385 QUEENS 40.706613 -73.868167 -14 days +00:00:00
    957189 2016-10-17 00:00:00 2016-10-05 00:00:00 DOHMH 10027 MANHATTAN 40.808263 -73.956266 -12 days +00:00:00
    958744 2016-09-19 16:54:00 2016-09-17 00:26:00 DOT 10303 STATEN ISLAND 40.631594 -74.160388 -3 days +07:32:00
    958745 2016-09-19 12:18:00 2016-09-17 00:15:00 DOT 10465 BRONX 40.814894 -73.803528 -3 days +11:57:00
    958746 2016-09-19 12:05:00 2016-09-16 22:52:00 DOT 10460 BRONX 40.834040 -73.895271 -3 days +10:47:00
    958747 2016-09-19 16:53:00 2016-09-16 22:57:00 DOT 10306 STATEN ISLAND 40.583297 -74.099911 -3 days +06:04:00
    958748 2016-09-19 11:52:00 2016-09-16 11:52:00 DOT 10474 BRONX 40.810851 -73.881858 -3 days +00:00:00
    958752 2016-09-20 12:01:00 2016-09-16 12:01:00 DOT 11434 QUEENS 40.679960 -73.782481 -4 days +00:00:00
    958753 2016-09-20 14:12:00 2016-09-16 14:12:00 DOT 11226 BROOKLYN 40.650388 -73.958708 -4 days +00:00:00
    958758 2016-09-20 11:53:00 2016-09-16 11:53:00 DOT 11694 QUEENS 40.572042 -73.857548 -4 days +00:00:00
    958759 2016-09-20 14:33:00 2016-09-16 14:33:00 DOT 11201 BROOKLYN 40.691194 -73.997768 -4 days +00:00:00
    958760 2016-09-20 14:31:00 2016-09-16 14:31:00 DOT 11215 BROOKLYN 40.666159 -73.995307 -4 days +00:00:00
    958804 2016-09-22 12:15:00 2016-09-21 12:14:00 DOT 11233 BROOKLYN 40.682427 -73.937931 -2 days +23:59:00
    958858 2016-09-26 16:23:00 2016-09-23 23:00:00 DOT 10308 STATEN ISLAND 40.545127 -74.139858 -3 days +06:37:00
    958859 2016-09-26 13:31:00 2016-09-23 21:20:00 DOT 10474 BRONX 40.809629 -73.890035 -3 days +07:49:00
    958860 2016-09-26 12:39:00 2016-09-23 12:39:00 DOT 11232 BROOKLYN 40.653856 -74.008167 -3 days +00:00:00
    958861 2016-09-26 12:28:00 2016-09-23 12:28:00 DOT 11211 BROOKLYN 40.705900 -73.963099 -3 days +00:00:00
    965521 2016-09-13 12:32:00 2016-09-09 12:32:00 DOT 11691 QUEENS 40.600204 -73.745044 -4 days +00:00:00
    965781 2016-09-13 11:51:00 2016-09-08 11:51:00 DOT 11416 QUEENS 40.689714 -73.840767 -5 days +00:00:00
    965843 2016-09-13 12:03:00 2016-09-08 12:03:00 DOT 11691 QUEENS 40.595140 -73.753987 -5 days +00:00:00
    967705 2016-09-15 16:52:00 2016-09-14 01:20:00 DOT 10309 STATEN ISLAND 40.521280 -74.239458 -2 days +08:28:00
    968181 2016-09-15 13:59:00 2016-09-14 13:58:00 DOT 10473 BRONX 40.819163 -73.879813 -2 days +23:59:00
    969890 2016-09-08 12:33:00 2016-09-06 12:33:00 DOT 11219 BROOKLYN 40.641815 -73.995813 -2 days +00:00:00
    970636 2016-09-12 12:24:00 2016-09-09 22:28:00 DOT 10314 STATEN ISLAND 40.598331 -74.127146 -3 days +10:04:00
    971003 2016-09-20 14:19:00 2016-09-16 14:19:00 DOT 11207 BROOKLYN 40.690455 -73.911748 -4 days +00:00:00
    971051 2016-09-20 11:47:00 2016-09-16 11:47:00 DOT 11414 QUEENS 40.648070 -73.844823 -4 days +00:00:00

    6417 rows × 8 columns

    And the large processing times as well

    
    
    In [34]:
    data[data['processing_time']>datetime.timedelta(140,0,0)]
    
    
    
    
    Out[34]:
    Created Date Closed Date Agency Incident Zip Borough Latitude Longitude processing_time
    Unique Key
    5371 2016-09-01 12:00:00 2017-01-24 00:00:00 DOB 11105 QUEENS 40.770843 -73.904511 144 days 12:00:00
    28919 2016-09-02 21:11:00 2017-01-27 00:00:00 DOB 11385 QUEENS 40.706241 -73.860424 146 days 02:49:00
    155601 2016-09-01 19:53:00 2017-01-27 11:59:00 DPR 11417 QUEENS 40.677149 -73.860489 147 days 16:06:00
    155747 2016-09-01 19:24:00 2017-01-24 00:00:00 DOB 11385 QUEENS 40.702376 -73.887810 144 days 04:36:00
    155818 2016-09-01 10:49:00 2017-01-28 00:00:00 DOB 11691 QUEENS 40.597741 -73.775975 148 days 13:11:00
    156081 2016-09-01 19:34:00 2017-01-25 08:04:00 DPR 11210 BROOKLYN 40.629988 -73.948792 145 days 12:30:00
    156134 2016-09-01 15:57:00 2017-01-25 08:05:00 DPR 11234 BROOKLYN 40.617043 -73.919579 145 days 16:08:00
    156523 2016-09-01 09:04:00 2017-01-27 14:12:00 DOT 11357 QUEENS 40.791344 -73.827361 148 days 05:08:00
    157320 2016-09-01 12:49:00 2017-01-23 12:00:00 DPR 11205 BROOKLYN 40.689642 -73.953477 143 days 23:11:00
    158319 2016-09-01 17:07:00 2017-01-23 00:00:00 DOB 11223 BROOKLYN 40.596909 -73.981448 143 days 06:53:00
    168301 2016-09-05 14:26:00 2017-01-27 10:26:00 DPR 11354 QUEENS 40.772271 -73.806820 143 days 20:00:00
    172151 2016-09-02 11:39:00 2017-01-25 00:00:00 DOB 11356 QUEENS 40.778338 -73.847936 144 days 12:21:00
    172709 2016-09-02 11:40:00 2017-01-25 00:00:00 DOB 11354 QUEENS 40.776631 -73.848008 144 days 12:20:00
    172761 2016-09-02 15:24:00 2017-01-23 00:00:00 DOB 11235 BROOKLYN 40.581568 -73.958553 142 days 08:36:00
    173841 2016-09-02 11:38:00 2017-01-24 10:02:00 DPR 11228 BROOKLYN 40.615314 -74.006804 143 days 22:24:00
    174256 2016-09-02 14:07:00 2017-01-28 00:00:00 DOB 11373 QUEENS 40.736084 -73.871790 147 days 09:53:00
    194295 2016-09-02 15:33:00 2017-01-25 08:04:00 DPR 11234 BROOKLYN 40.624334 -73.917466 144 days 16:31:00
    194296 2016-09-02 15:47:00 2017-01-24 00:00:00 DOB 11356 QUEENS 40.788745 -73.853007 143 days 08:13:00
    223853 2016-09-03 16:22:00 2017-01-24 10:01:00 DPR 11223 BROOKLYN 40.602380 -73.982466 142 days 17:39:00
    224063 2016-09-03 10:34:00 2017-01-25 00:00:00 DOB 11220 BROOKLYN 40.648797 -74.011917 143 days 13:26:00
    279023 2016-09-07 14:05:00 2017-01-26 17:01:00 DPR 11225 BROOKLYN 40.658557 -73.952496 141 days 02:56:00
    279371 2016-09-07 14:17:00 2017-01-26 17:01:00 DPR 11225 BROOKLYN 40.658587 -73.952262 141 days 02:44:00
    279375 2016-09-07 12:13:00 2017-01-27 15:01:00 DPR 11226 BROOKLYN 40.647595 -73.948241 142 days 02:48:00
    290214 2016-09-04 13:06:00 2017-01-27 08:36:00 DOT 11435 QUEENS 40.712113 -73.819291 144 days 19:30:00
    299493 2016-09-02 14:08:00 2017-01-23 07:13:00 DOT 11385 QUEENS 40.709934 -73.897872 142 days 17:05:00
    299505 2016-09-03 15:38:00 2017-01-26 16:02:00 DPR 10458 BRONX 40.859428 -73.891113 145 days 00:24:00
    303524 2016-09-01 04:13:00 2017-01-22 00:00:00 DOB 11413 QUEENS 40.656629 -73.753756 142 days 19:47:00
    338097 2016-09-01 13:26:00 2017-01-24 00:00:00 DOB 11379 QUEENS 40.722387 -73.875947 144 days 10:34:00
    338103 2016-09-03 10:42:00 2017-01-24 00:00:00 DOB 11220 BROOKLYN 40.648707 -74.011766 142 days 13:18:00
    338107 2016-09-07 13:49:00 2017-01-28 14:01:00 DPR 11210 BROOKLYN 40.620744 -73.949580 143 days 00:12:00
    370926 2016-09-05 09:22:00 2017-01-27 08:59:00 DPR 11366 QUEENS 40.732338 -73.776576 143 days 23:37:00
    431150 2016-09-05 00:46:00 2017-01-23 11:16:00 DOT 11233 BROOKLYN 40.678364 -73.908137 140 days 10:30:00
    470712 2016-09-03 14:08:00 2017-01-25 00:00:00 DOB 11354 QUEENS 40.773811 -73.847050 143 days 09:52:00
    472657 2016-09-02 08:46:00 2017-01-24 18:00:00 DPR 11238 BROOKLYN 40.681132 -73.967349 144 days 09:14:00
    472768 2016-09-02 17:08:00 2017-01-27 08:01:00 DPR 11214 BROOKLYN 40.589295 -73.984219 146 days 14:53:00
    473340 2016-09-02 14:35:00 2017-01-27 08:02:00 DPR 11223 BROOKLYN 40.588090 -73.967877 146 days 17:27:00
    473358 2016-09-02 17:07:00 2017-01-27 08:02:00 DPR 11214 BROOKLYN 40.589295 -73.984568 146 days 14:55:00
    473549 2016-09-02 09:31:00 2017-01-25 08:01:00 DPR 11226 BROOKLYN 40.639232 -73.962131 144 days 22:30:00
    477723 2016-09-06 16:16:00 2017-01-28 14:02:00 DPR 11210 BROOKLYN 40.621882 -73.954623 143 days 21:46:00
    478807 2016-09-06 11:59:00 2017-01-28 15:01:00 DPR 11103 QUEENS 40.763953 -73.918272 144 days 03:02:00
    478939 2016-09-06 17:30:00 2017-01-27 00:00:00 DOB 11235 BROOKLYN 40.585533 -73.938302 142 days 06:30:00
    521571 2016-09-07 00:55:00 2017-01-25 08:01:00 DPR 11234 BROOKLYN 40.618109 -73.914095 140 days 07:06:00
    522055 2016-09-06 09:42:00 2017-01-27 00:00:00 DOB 11235 BROOKLYN 40.587920 -73.940532 142 days 14:18:00
    533600 2016-09-07 19:36:00 2017-01-26 10:16:00 DOT 11412 QUEENS 40.692284 -73.752510 140 days 14:40:00
    558640 2016-09-08 13:36:00 2017-01-29 14:00:00 DPR 11370 QUEENS 40.755721 -73.893939 143 days 00:24:00
    569755 2016-09-08 09:48:00 2017-01-28 15:01:00 DPR 11204 BROOKLYN 40.630970 -73.983910 142 days 05:13:00
    614925 2016-09-06 15:30:00 2017-01-25 16:22:00 DPR 11412 QUEENS 40.701398 -73.758305 141 days 00:52:00
    681064 2016-09-01 23:31:00 2017-01-24 14:11:00 DOT 10469 BRONX 40.864486 -73.843389 144 days 14:40:00
    711401 2016-09-07 09:57:00 2017-01-27 00:00:00 DOB 11373 QUEENS 40.734269 -73.883574 141 days 14:03:00
    836666 2016-09-10 12:25:00 2017-01-29 00:00:00 DOB 11423 QUEENS 40.721838 -73.774479 140 days 11:35:00

    Looks like the upper end makes sense but the negative times don't

    Though we need to explore this more, we'll get rid of negative times for now

    
    
    In [35]:
    data = data[data['processing_time']>=datetime.timedelta(0,0,0)]
    
    
    
    In [36]:
    data.info()
    
    
    
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 799325 entries, 11 to 971063
    Data columns (total 8 columns):
    Created Date       799325 non-null datetime64[ns]
    Closed Date        799325 non-null datetime64[ns]
    Agency             799325 non-null object
    Incident Zip       799325 non-null object
    Borough            799325 non-null object
    Latitude           799325 non-null float64
    Longitude          799325 non-null float64
    processing_time    799325 non-null timedelta64[ns]
    dtypes: datetime64[ns](2), float64(2), object(3), timedelta64[ns](1)
    memory usage: 54.9+ MB
    

    Finally, let's write a function that incorporates all our changes

    
    
    In [ ]:
    def read_311_data(datafile):
        import pandas as pd
        import numpy as np
        #Add the fix_zip function
        def fix_zip(input_zip):
            try:
                input_zip = int(float(input_zip))
            except:
                try:
                    input_zip = int(input_zip.split('-')[0])
                except:
                    return np.NaN
            if input_zip < 10000 or input_zip > 19999:
                return np.NaN
            return str(input_zip)
        
        #Read the file
        df = pd.read_csv(datafile,index_col='Unique Key')
        
        #fix the zip
        df['Incident Zip'] = df['Incident Zip'].apply(fix_zip)
        
        #drop all rows that have any nans in them (note the easier syntax!)
        
        df = df.dropna(how='any')
        
        #get rid of unspecified boroughs
        df = df[df['Borough'] != 'Unspecified']
        
        #Convert times to datetime and create a processing time column
        
        import datetime
        df['Created Date'] = df['Created Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
        df['Closed Date'] = df['Closed Date'].apply(lambda x:datetime.datetime.strptime(x,'%m/%d/%Y %I:%M:%S %p'))
        df['processing_time'] =  df['Closed Date'] - df['Created Date']
        
        #Finally, get rid of negative processing times and return the final data frame
        
        df = df[df['processing_time']>=datetime.timedelta(0,0,0)]
        
        return df
    
    
    
    In [ ]:
    df = read_311_data('nyc_311_data_subset.csv')
    df.info()
    
    
    
    In [ ]: