In [1]:

    
import pg8000
import pandas as pd
import matplotlib
import numpy
%matplotlib inline
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", port=5432, database="training", user="dot_student", password="qgis")



In [2]:

    
cursor = conn.cursor()



In [3]:

    
conn.rollback()



In [4]:

    
cursor.execute("SELECT * FROM dot_311")
results = []
for item in cursor.fetchall():
    results.append(item)



In [5]:

    
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
columns_names = []
for item in cursor.fetchall():
    print(item)
    columns_names.append(item)









    



['gid']
['unique_key']
['agency']
['agency nam']
['complaint']
['descriptor']
['location t']
['incident z']
['incident a']
['street nam']
['cross stre']
['cross st_1']
['intersecti']
['intersec_1']
['address ty']
['city']
['landmark']
['facility t']
['status']
['due date']
['resolution']
['resoluti_1']
['community']
['borough']
['x coordina']
['y coordina']
['park facil']
['park borou']
['school nam']
['school num']
['school reg']
['school cod']
['school pho']
['school add']
['school cit']
['school sta']
['school zip']
['school not']
['school or']
['vehicle ty']
['taxi compa']
['taxi pick']
['bridge hig']
['bridge h_1']
['road ramp']
['bridge h_2']
['garage lot']
['ferry dire']
['ferry term']
['latitude']
['longitude']
['location']
['geom']
['created_date']
['closed_date']



In [6]:

    
df= pd.DataFrame(results)



In [7]:

    
df.columns = ['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor', 'location t', 'incident z', 'incident a', 'street nam', 'cross stre', 'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city', 'landmark', 'facility t', 'status', 'due date', 'resolution', 'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina', 'park facil', 'park borou', 'school nam', 'school num', 'school reg', 'school cod', 'school pho', 'school add', 'school cit', 'school sta', 'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa', 'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2', 'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude', 'location', 'geom', 'created_date', 'closed_date']



In [8]:

    
df.head()









    Out[8]:






  
    
      
      gid
      unique_key
      agency
      agency nam
      complaint
      descriptor
      location t
      incident z
      incident a
      street nam
      ...
      bridge h_2
      garage lot
      ferry dire
      ferry term
      latitude
      longitude
      location
      geom
      created_date
      closed_date
    
  
  
    
      0
      2
      32570549
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301
      None
      None
      ...
      None
      None
      None
      None
      40.613803753157917
      -74.113373738864198
      (40.61380375315792, -74.1133737388642)
      0101000020E61000006829ED83418752C0D7FC121F914E...
      2016-02-01 00:12:00
      2016-02-01 01:15:00
    
    
      1
      3
      32572958
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301
      None
      None
      ...
      None
      None
      None
      None
      40.626545297464929
      -74.091582334480577
      (40.62654529746493, -74.09158233448058)
      0101000020E610000011DF267CDC8552C08809E5A23250...
      2016-02-01 00:14:00
      2016-02-01 00:54:00
    
    
      2
      4
      32573576
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10310
      None
      None
      ...
      None
      None
      None
      None
      40.629249554787492
      -74.123220752604595
      (40.62924955478749, -74.1232207526046)
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:15:00
      2016-02-01 01:30:00
    
    
      3
      5
      32572389
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      None
      None
      None
      ...
      None
      None
      None
      None
      None
      None
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:17:00
      2016-02-01 02:35:00
    
    
      4
      6
      32570548
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      None
      None
      None
      ...
      None
      None
      None
      None
      None
      None
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:19:00
      2016-02-01 01:00:00
    
  

5 rows × 55 columns



In [9]:

    
df.columns









    Out[9]:





Index(['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor',
       'location t', 'incident z', 'incident a', 'street nam', 'cross stre',
       'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city',
       'landmark', 'facility t', 'status', 'due date', 'resolution',
       'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina',
       'park facil', 'park borou', 'school nam', 'school num', 'school reg',
       'school cod', 'school pho', 'school add', 'school cit', 'school sta',
       'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa',
       'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2',
       'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude',
       'location', 'geom', 'created_date', 'closed_date'],
      dtype='object')



In [10]:

    
df['new_created_date'] = pd.to_datetime(df['created_date'], 
                                format='%m/%d/%y %H:%M:%S')



In [11]:

    
df['new_closed_date'] = pd.to_datetime(df['closed_date'], 
                                format='%m/%d/%y %H:%M:%S')



In [12]:

    
df['time_open'] = df['new_closed_date'] - df['new_created_date']



In [13]:

    
df.head()









    Out[13]:






  
    
      
      gid
      unique_key
      agency
      agency nam
      complaint
      descriptor
      location t
      incident z
      incident a
      street nam
      ...
      ferry term
      latitude
      longitude
      location
      geom
      created_date
      closed_date
      new_created_date
      new_closed_date
      time_open
    
  
  
    
      0
      2
      32570549
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301
      None
      None
      ...
      None
      40.613803753157917
      -74.113373738864198
      (40.61380375315792, -74.1133737388642)
      0101000020E61000006829ED83418752C0D7FC121F914E...
      2016-02-01 00:12:00
      2016-02-01 01:15:00
      2016-02-01 00:12:00
      2016-02-01 01:15:00
      01:03:00
    
    
      1
      3
      32572958
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301
      None
      None
      ...
      None
      40.626545297464929
      -74.091582334480577
      (40.62654529746493, -74.09158233448058)
      0101000020E610000011DF267CDC8552C08809E5A23250...
      2016-02-01 00:14:00
      2016-02-01 00:54:00
      2016-02-01 00:14:00
      2016-02-01 00:54:00
      00:40:00
    
    
      2
      4
      32573576
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10310
      None
      None
      ...
      None
      40.629249554787492
      -74.123220752604595
      (40.62924955478749, -74.1232207526046)
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:15:00
      2016-02-01 01:30:00
      2016-02-01 00:15:00
      2016-02-01 01:30:00
      01:15:00
    
    
      3
      5
      32572389
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      None
      None
      None
      ...
      None
      None
      None
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:17:00
      2016-02-01 02:35:00
      2016-02-01 00:17:00
      2016-02-01 02:35:00
      02:18:00
    
    
      4
      6
      32570548
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      None
      None
      None
      ...
      None
      None
      None
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:19:00
      2016-02-01 01:00:00
      2016-02-01 00:19:00
      2016-02-01 01:00:00
      00:41:00
    
  

5 rows × 58 columns



In [14]:

    
df['time_open'].value_counts()









    Out[14]:





0 days 00:00:00      521
-3 days +00:00:00    184
-6 days +00:00:00    112
-2 days +00:00:00     27
-2 days +23:59:00     26
0 days 00:20:00       23
0 days 01:32:00       10
0 days 01:26:00       10
0 days 00:25:00        9
0 days 01:27:00        9
0 days 02:02:00        9
0 days 01:34:00        9
0 days 00:40:00        9
0 days 00:15:00        8
0 days 01:09:00        8
0 days 00:54:00        8
0 days 01:15:00        8
0 days 00:45:00        8
0 days 01:42:00        8
0 days 00:49:00        7
0 days 01:51:00        7
0 days 01:22:00        7
0 days 01:18:00        7
0 days 01:52:00        7
0 days 02:18:00        7
0 days 01:29:00        7
0 days 01:33:00        7
0 days 01:58:00        7
0 days 01:36:00        7
-4 days +23:59:00      7
                    ... 
0 days 23:53:09        1
0 days 23:53:25        1
29 days 13:19:50       1
0 days 06:54:59        1
0 days 21:32:37        1
10 days 16:07:27       1
0 days 03:14:00        1
0 days 04:56:26        1
3 days 16:46:05        1
43 days 18:11:57       1
1 days 06:05:30        1
1 days 02:06:00        1
43 days 15:03:49       1
1 days 18:18:00        1
2 days 18:07:22        1
0 days 08:05:00        1
0 days 01:54:23        1
12 days 16:55:00       1
2 days 05:16:45        1
5 days 23:40:03        1
7 days 20:52:00        1
50 days 00:25:00       1
0 days 19:02:57        1
0 days 12:56:00        1
-3 days +06:44:00      1
31 days 22:41:41       1
0 days 22:41:28        1
13 days 23:26:00       1
0 days 12:54:24        1
0 days 03:11:44        1
Name: time_open, dtype: int64



In [15]:

    
time_open = pd.DataFrame(df['time_open'][df['time_open'] > pd.Timedelta(minutes=0)].dropna())



In [16]:

    
time_open









    Out[16]:






  
    
      
      time_open
    
  
  
    
      0
      0 days 01:03:00
    
    
      1
      0 days 00:40:00
    
    
      2
      0 days 01:15:00
    
    
      3
      0 days 02:18:00
    
    
      4
      0 days 00:41:00
    
    
      5
      0 days 01:39:00
    
    
      6
      0 days 01:08:00
    
    
      7
      0 days 00:38:00
    
    
      8
      0 days 00:10:00
    
    
      9
      0 days 21:57:44
    
    
      10
      0 days 20:04:17
    
    
      11
      7 days 11:45:00
    
    
      12
      9 days 08:20:00
    
    
      13
      0 days 00:20:00
    
    
      15
      0 days 12:48:00
    
    
      16
      2 days 15:40:11
    
    
      17
      4 days 10:44:00
    
    
      18
      3 days 09:01:00
    
    
      19
      1 days 11:33:00
    
    
      20
      0 days 08:31:24
    
    
      21
      7 days 13:18:00
    
    
      22
      3 days 09:34:06
    
    
      23
      0 days 06:56:59
    
    
      24
      1 days 11:36:00
    
    
      25
      0 days 08:06:00
    
    
      26
      0 days 10:19:00
    
    
      27
      2 days 06:37:00
    
    
      28
      0 days 00:23:00
    
    
      29
      0 days 06:07:38
    
    
      30
      2 days 00:01:00
    
    
      ...
      ...
    
    
      5820
      4 days 15:00:42
    
    
      5821
      2 days 17:15:49
    
    
      5822
      1 days 04:45:30
    
    
      5823
      1 days 04:44:21
    
    
      5824
      1 days 12:59:00
    
    
      5825
      2 days 13:35:00
    
    
      5826
      1 days 04:34:23
    
    
      5827
      1 days 16:04:26
    
    
      5828
      38 days 12:04:00
    
    
      5829
      4 days 11:02:00
    
    
      5830
      19 days 11:30:57
    
    
      5831
      0 days 21:04:18
    
    
      5832
      5 days 16:30:44
    
    
      5833
      15 days 11:14:55
    
    
      5834
      10 days 10:27:48
    
    
      5835
      1 days 05:13:55
    
    
      5836
      1 days 14:36:53
    
    
      5837
      1 days 12:28:00
    
    
      5838
      2 days 00:43:43
    
    
      5839
      9 days 10:42:25
    
    
      5840
      1 days 22:44:00
    
    
      5841
      0 days 04:21:00
    
    
      5842
      0 days 17:31:11
    
    
      5843
      0 days 20:04:58
    
    
      5844
      1 days 07:15:53
    
    
      5845
      0 days 21:33:40
    
    
      5846
      1 days 23:13:00
    
    
      5848
      5 days 22:37:00
    
    
      5849
      1 days 10:13:27
    
    
      5850
      3 days 15:19:00
    
  

4547 rows × 1 columns



In [17]:

    
time_open.median()









    Out[17]:





time_open   1 days 03:33:17
dtype: timedelta64[ns]



In [18]:

    
time_open.mean()









    Out[18]:





time_open   8 days 16:15:03.514405
dtype: timedelta64[ns]



In [19]:

    
time_open.mode()



In [20]:

    
time_open['time_open'].quantile(q=0.25)









    Out[20]:





Timedelta('0 days 12:41:58')



In [21]:

    
time_open['time_open'].quantile(q=0.5)









    Out[21]:





Timedelta('1 days 03:33:17')



In [22]:

    
time_open['time_open'].quantile(q=0.75)









    Out[22]:





Timedelta('6 days 00:30:23.500000')



In [23]:

    
iqr = (time_open['time_open'].quantile(q=0.75)) - (time_open['time_open'].quantile(q=0.25))



In [24]:

    
iqr









    Out[24]:





Timedelta('5 days 11:48:25.500000')



In [25]:

    
ual = (iqr * 1.5) + time_open.median()



In [26]:

    
ual









    Out[26]:





time_open   9 days 09:15:55.250000
dtype: timedelta64[ns]



In [27]:

    
lal = (iqr * 1.5) - time_open.median()



In [28]:

    
lal









    Out[28]:





time_open   7 days 02:09:21.250000
dtype: timedelta64[ns]



In [29]:

    
numpy.std(time_open)









    Out[29]:





time_open   15 days 10:45:41.824078
dtype: timedelta64[ns]



In [30]:

    
time_open.max()









    Out[30]:





time_open   89 days 18:54:00
dtype: timedelta64[ns]



In [31]:

    
time_open.min()









    Out[31]:





time_open   00:00:01
dtype: timedelta64[ns]



In [32]:

    
data_range = time_open.max() - time_open.min()



In [33]:

    
data_range









    Out[33]:





time_open   89 days 18:53:59
dtype: timedelta64[ns]



In [67]:

    
(time_open.astype('timedelta64[m]')).hist(bins=100, figsize=(10,5))









    Out[67]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10df9a0f0>]], dtype=object)

This data are skewed by some outliers, especially in the Q4: while 75% of the complaints have been open 6 days or less, the longest complaint has been open for 89 days - which is also the range of this data -, and the shortest is only one second long. This is also shown by the standard deviation, that is 15 days, and the difference between the mean (1 day) and the mean (8 days and a half). In fact, the most common value is 20 minutes.

	gid	unique_key	agency	agency nam	complaint	descriptor	location t	incident z	incident a	street nam	...	bridge h_2	garage lot	ferry dire	ferry term	latitude	longitude	location	geom	created_date	closed_date
0	2	32570549	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10301	None	None	...	None	None	None	None	40.613803753157917	-74.113373738864198	(40.61380375315792, -74.1133737388642)	0101000020E61000006829ED83418752C0D7FC121F914E...	2016-02-01 00:12:00	2016-02-01 01:15:00
1	3	32572958	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10301	None	None	...	None	None	None	None	40.626545297464929	-74.091582334480577	(40.62654529746493, -74.09158233448058)	0101000020E610000011DF267CDC8552C08809E5A23250...	2016-02-01 00:14:00	2016-02-01 00:54:00
2	4	32573576	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10310	None	None	...	None	None	None	None	40.629249554787492	-74.123220752604595	(40.62924955478749, -74.1232207526046)	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:15:00	2016-02-01 01:30:00
3	5	32572389	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	None	None	None	...	None	None	None	None	None	None	None	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:17:00	2016-02-01 02:35:00
4	6	32570548	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	None	None	None	...	None	None	None	None	None	None	None	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:19:00	2016-02-01 01:00:00

	time_open
0	0 days 01:03:00
1	0 days 00:40:00
2	0 days 01:15:00
3	0 days 02:18:00
4	0 days 00:41:00
5	0 days 01:39:00
6	0 days 01:08:00
7	0 days 00:38:00
8	0 days 00:10:00
9	0 days 21:57:44
10	0 days 20:04:17
11	7 days 11:45:00
12	9 days 08:20:00
13	0 days 00:20:00
15	0 days 12:48:00
16	2 days 15:40:11
17	4 days 10:44:00
18	3 days 09:01:00
19	1 days 11:33:00
20	0 days 08:31:24
21	7 days 13:18:00
22	3 days 09:34:06
23	0 days 06:56:59
24	1 days 11:36:00
25	0 days 08:06:00
26	0 days 10:19:00
27	2 days 06:37:00
28	0 days 00:23:00
29	0 days 06:07:38
30	2 days 00:01:00
...	...
5820	4 days 15:00:42
5821	2 days 17:15:49
5822	1 days 04:45:30
5823	1 days 04:44:21
5824	1 days 12:59:00
5825	2 days 13:35:00
5826	1 days 04:34:23
5827	1 days 16:04:26
5828	38 days 12:04:00
5829	4 days 11:02:00
5830	19 days 11:30:57
5831	0 days 21:04:18
5832	5 days 16:30:44
5833	15 days 11:14:55
5834	10 days 10:27:48
5835	1 days 05:13:55
5836	1 days 14:36:53
5837	1 days 12:28:00
5838	2 days 00:43:43
5839	9 days 10:42:25
5840	1 days 22:44:00
5841	0 days 04:21:00
5842	0 days 17:31:11
5843	0 days 20:04:58
5844	1 days 07:15:53
5845	0 days 21:33:40
5846	1 days 23:13:00
5848	5 days 22:37:00
5849	1 days 10:13:27
5850	3 days 15:19:00