In [1]:
import pg8000
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

Connecting to the database


In [242]:
conn = pg8000.connect(user = 'dot_student', database='training', port=5432, host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', password='qgis')

In [177]:
conn.rollback()

In [178]:
cursor = conn.cursor()

In [179]:
cursor.execute("SELECT column_name FROM information_schema.columns WHERE table_name='dot_311'")
# run the commented out code to see list of all column names
# cursor.fetchall()

In [188]:
cursor.execute("SELECT complaint, descriptor, created_date, closed_date FROM dot_311")
dot311 = cursor.fetchall()

In [190]:
# dot311

Create a list of dictionaries that have the following structure:

[
...



 {'closed_date': [datetime.datetime(2016, 2, 2, 9, 1)],
  'complaint': ['Street Condition'],
  'created_date': [datetime.datetime(2016, 2, 1, 7, 0)],
  'descriptor': ['Pothole'],
  'time_open': datetime.timedelta(1, 7260)},
 {'closed_date': [datetime.datetime(2016, 2, 2, 12, 50)],
  'complaint': ['Street Condition'],
  'created_date': [datetime.datetime(2016, 2, 1, 7, 0)],
  'descriptor': ['Pothole'],
  'time_open': datetime.timedelta(1, 21000)}, 


  ...

  ]

In [199]:
dot311_list=[]

for item in dot311: 
    
    dot311_dict={}
    
    dot311_dict['complaint'] = [item[0]]
    dot311_dict['descriptor'] = [item[1]]
    dot311_dict['created_date'] = [item[2]]
    dot311_dict['closed_date'] = [item[3]]
    
    # checks that created_date and closed_date are not NoneTypes and that the closed date is after the created date. 
    if dot311_dict['created_date'][0] != None and dot311_dict['closed_date'][0] != None and dot311_dict['closed_date']>dot311_dict['created_date']: 
        # subtracting datetime.datetime objects gives you a dateime.timedelta object
        # helpful: http://stackoverflow.com/questions/2861770/how-do-i-subtract-two-dates-in-django-python
        dot311_dict['time_open'] = dot311_dict['closed_date'][0] - dot311_dict['created_date'][0]
    else: 
        dot311_dict['time_open'] = None
    
    dot311_list.append(dot311_dict)

In [200]:
# dot311_list

Create a dataframe from list of dicts


In [201]:
df = pd.DataFrame.from_dict(dot311_list)

In [202]:
df.head(50)


Out[202]:
closed_date complaint created_date descriptor time_open
0 [2016-02-01 01:15:00] [Traffic Signal Condition] [2016-02-01 00:12:00] [Controller] 0 days 01:03:00
1 [2016-02-01 00:54:00] [Traffic Signal Condition] [2016-02-01 00:14:00] [Controller] 0 days 00:40:00
2 [2016-02-01 01:30:00] [Traffic Signal Condition] [2016-02-01 00:15:00] [Controller] 0 days 01:15:00
3 [2016-02-01 02:35:00] [Traffic Signal Condition] [2016-02-01 00:17:00] [Controller] 0 days 02:18:00
4 [2016-02-01 01:00:00] [Traffic Signal Condition] [2016-02-01 00:19:00] [Controller] 0 days 00:41:00
5 [2016-02-01 02:00:00] [Traffic Signal Condition] [2016-02-01 00:21:00] [Controller] 0 days 01:39:00
6 [2016-02-01 01:30:00] [Traffic Signal Condition] [2016-02-01 00:22:00] [Controller] 0 days 01:08:00
7 [2016-02-01 01:05:00] [Traffic Signal Condition] [2016-02-01 00:27:00] [Controller] 0 days 00:38:00
8 [2016-02-01 00:42:00] [Traffic Signal Condition] [2016-02-01 00:32:00] [Base Door] 0 days 00:10:00
9 [2016-02-01 22:30:00] [Street Condition] [2016-02-01 00:32:16] [Pothole] 0 days 21:57:44
10 [2016-02-01 20:44:42] [Street Condition] [2016-02-01 00:40:25] [Plate Condition - Noisy] 0 days 20:04:17
11 [2016-02-08 12:29:00] [Street Light Condition] [2016-02-01 00:44:00] [Street Light Out] 7 days 11:45:00
12 [2016-02-10 09:10:00] [Traffic Signal Condition] [2016-02-01 00:50:00] [Post] 9 days 08:20:00
13 [2016-02-01 01:13:00] [Traffic Signal Condition] [2016-02-01 00:53:00] [Pedestrian Signal] 0 days 00:20:00
14 [None] [Street Light Condition] [2016-02-01 14:26:00] [Lamppost Knocked Down] NaT
15 [2016-02-01 13:45:00] [Street Light Condition] [2016-02-01 00:57:00] [Street Light Out] 0 days 12:48:00
16 [2016-02-03 16:59:00] [Street Condition] [2016-02-01 01:18:49] [Line/Marking - Faded] 2 days 15:40:11
17 [2016-02-05 12:04:00] [Street Light Condition] [2016-02-01 01:20:00] [Street Light Out] 4 days 10:44:00
18 [2016-02-04 10:37:00] [Street Light Condition] [2016-02-01 01:36:00] [Street Light Out] 3 days 09:01:00
19 [2016-02-02 13:09:00] [Street Light Condition] [2016-02-01 01:36:00] [Street Light Out] 1 days 11:33:00
20 [2016-02-01 10:23:00] [Street Condition] [2016-02-01 01:51:36] [Pothole] 0 days 08:31:24
21 [2016-02-08 15:25:00] [Street Light Condition] [2016-02-01 02:07:00] [Street Light Out] 7 days 13:18:00
22 [2016-02-04 12:06:14] [Sidewalk Condition] [2016-02-01 02:32:08] [Newspaper Box Complaint] 3 days 09:34:06
23 [2016-02-01 09:40:00] [Street Condition] [2016-02-01 02:43:01] [Pothole] 0 days 06:56:59
24 [2016-02-02 14:57:00] [Street Light Condition] [2016-02-01 03:21:00] [Street Light Out] 1 days 11:36:00
25 [2016-02-01 11:30:00] [Traffic Signal Condition] [2016-02-01 03:24:00] [Controller] 0 days 08:06:00
26 [2016-02-01 13:44:00] [Street Light Condition] [2016-02-01 03:25:00] [Street Light Out] 0 days 10:19:00
27 [2016-02-03 10:03:00] [Street Light Condition] [2016-02-01 03:26:00] [Street Light Out] 2 days 06:37:00
28 [2016-02-01 04:00:00] [Traffic Signal Condition] [2016-02-01 03:37:00] [LED Pedestrian Unit] 0 days 00:23:00
29 [2016-02-01 09:57:00] [Street Condition] [2016-02-01 03:49:22] [Pothole] 0 days 06:07:38
30 [2016-02-08 15:23:00] [Street Light Condition] [2016-02-06 15:22:00] [Street Light Out] 2 days 00:01:00
31 [2016-02-02 07:25:35] [Street Condition] [2016-02-01 03:59:57] [Cave-in] 1 days 03:25:38
32 [2016-03-31 09:45:00] [Traffic Signal Condition] [2016-02-01 04:05:00] [Controller] 59 days 05:40:00
33 [2016-02-01 04:35:00] [Traffic Signal Condition] [2016-02-01 04:15:00] [LED Pedestrian Unit] 0 days 00:20:00
34 [2016-02-24 11:15:00] [Traffic Signal Condition] [2016-02-01 04:48:00] [Controller] 23 days 06:27:00
35 [2016-02-02 03:00:00] [Traffic Signal Condition] [2016-02-01 05:12:00] [Veh Signal Lamp] 0 days 21:48:00
36 [2016-02-01 15:18:44] [Street Condition] [2016-02-01 05:25:36] [Defective Hardware] 0 days 09:53:08
37 [2016-02-02 13:12:00] [Street Light Condition] [2016-02-01 05:46:00] [Street Light Out] 1 days 07:26:00
38 [2016-04-04 08:30:00] [Traffic Signal Condition] [2016-02-01 05:59:00] [Controller] 63 days 02:31:00
39 [2016-02-01 10:00:00] [Street Condition] [2016-02-01 06:20:11] [Pothole] 0 days 03:39:49
40 [2016-02-01 10:15:00] [Traffic Signal Condition] [2016-02-01 06:36:00] [Veh Sgnl Mult Lamps] 0 days 03:39:00
41 [2016-02-01 11:30:00] [Street Condition] [2016-02-01 06:38:49] [Pothole] 0 days 04:51:11
42 [2016-02-01 14:07:00] [Street Light Condition] [2016-02-01 06:41:00] [Street Light Out] 0 days 07:26:00
43 [2016-02-02 10:12:11] [Sidewalk Condition] [2016-02-01 06:48:29] [Sidewalk Collapsed] 1 days 03:23:42
44 [2016-02-01 19:45:00] [Traffic Signal Condition] [2016-02-01 15:35:00] [Veh Signal Lamp] 0 days 04:10:00
45 [2016-02-02 09:10:00] [Street Condition] [2016-02-01 06:56:07] [Pothole] 1 days 02:13:53
46 [2016-01-29 01:00:00] [Street Light Condition] [2016-02-01 14:48:00] [Street Light Out] NaT
47 [2016-02-02 09:45:00] [Traffic Signal Condition] [2016-02-01 06:57:00] [Controller] 1 days 02:48:00
48 [2016-02-02 00:40:00] [Street Condition] [2016-02-01 06:58:59] [Pothole] 0 days 17:41:01
49 [2016-02-02 12:54:00] [Street Condition] [2016-02-01 07:00:00] [Pothole] 1 days 05:54:00

FIVE NUMBER SUMMARY for the length of time between creating the complaint and closing the complaint

minimum, maximum, first quartile, median, third quartile


In [284]:
minimum = df['time_open'].min()
print(minimum)
# dang, so fast.


0 days 00:00:01

In [283]:
maximum = df['time_open'].max()
print(maximum)


89 days 18:54:00

In [282]:
first_qtr = df['time_open'].quantile(q=0.25)
print(first_qtr)


0 days 12:41:58

In [285]:
median = df['time_open'].quantile(q=0.5)
print(median)


1 days 03:33:17

In [286]:
third_qtr = df['time_open'].quantile(q=0.75)
print(third_qtr)


6 days 00:30:23.500000

In [235]:
df['time_open_hours'] = df['time_open'].apply(lambda x: x.total_seconds()) / 60 / 60

In [256]:
# note: this was necessary because the by parameter of the box plot only accepts sequences, i.e. tuples and other such things
# if you try to use the original complaint column it returns an error saying that "by" only accepts sequences, not groupby sequences 
df['complaint_'] = df['complaint'].apply(lambda x: (x[0]))

A quick boxplot of hours a complaint is open, grouped by type of complaint

*the alignment on this graph is shifted one to the right


In [281]:
ax = df.boxplot(column='time_open_hours', by='complaint_', figsize=(20,20), rot=45, fontsize=14)