In [10]:
import pandas as pd
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training', user='dot_student', password='qgis')

In [11]:
cursor = conn.cursor()

In [12]:
cursor.execute("SELECT * FROM dot_311")

In [13]:
dataset = []
for record in cursor.fetchall():
    dataset.append(record)

In [14]:
df = pd.DataFrame(dataset)

In [18]:
conn.rollback()

In [16]:
df.head()


Out[16]:
0 1 2 3 4 5 6 7 8 9 ... 45 46 47 48 49 50 51 52 53 54
0 2 32570549 DOT Department of Transportation Traffic Signal Condition Controller None 10301 None None ... None None None None 40.613803753157917 -74.113373738864198 (40.61380375315792, -74.1133737388642) 0101000020E61000006829ED83418752C0D7FC121F914E... 2016-02-01 00:12:00 2016-02-01 01:15:00
1 3 32572958 DOT Department of Transportation Traffic Signal Condition Controller None 10301 None None ... None None None None 40.626545297464929 -74.091582334480577 (40.62654529746493, -74.09158233448058) 0101000020E610000011DF267CDC8552C08809E5A23250... 2016-02-01 00:14:00 2016-02-01 00:54:00
2 4 32573576 DOT Department of Transportation Traffic Signal Condition Controller None 10310 None None ... None None None None 40.629249554787492 -74.123220752604595 (40.62924955478749, -74.1232207526046) 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:15:00 2016-02-01 01:30:00
3 5 32572389 DOT Department of Transportation Traffic Signal Condition Controller None None None None ... None None None None None None None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:17:00 2016-02-01 02:35:00
4 6 32570548 DOT Department of Transportation Traffic Signal Condition Controller None None None None ... None None None None None None None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:19:00 2016-02-01 01:00:00

5 rows × 55 columns


In [24]:
df.columns


Out[24]:
RangeIndex(start=0, stop=55, step=1)

In [28]:
selection = df[[1,53,54]]
selection.columns = ("unique_key", "created_date", "closed_date")

In [30]:
selection.head()


Out[30]:
unique_key created_date closed_date
0 32570549 2016-02-01 00:12:00 2016-02-01 01:15:00
1 32572958 2016-02-01 00:14:00 2016-02-01 00:54:00
2 32573576 2016-02-01 00:15:00 2016-02-01 01:30:00
3 32572389 2016-02-01 00:17:00 2016-02-01 02:35:00
4 32570548 2016-02-01 00:19:00 2016-02-01 01:00:00

In [38]:
selection['closed_date'] - selection['created_date']


Out[38]:
0       0 days 01:03:00
1       0 days 00:40:00
2       0 days 01:15:00
3       0 days 02:18:00
4       0 days 00:41:00
5       0 days 01:39:00
6       0 days 01:08:00
7       0 days 00:38:00
8       0 days 00:10:00
9       0 days 21:57:44
10      0 days 20:04:17
11      7 days 11:45:00
12      9 days 08:20:00
13      0 days 00:20:00
14                  NaT
15      0 days 12:48:00
16      2 days 15:40:11
17      4 days 10:44:00
18      3 days 09:01:00
19      1 days 11:33:00
20      0 days 08:31:24
21      7 days 13:18:00
22      3 days 09:34:06
23      0 days 06:56:59
24      1 days 11:36:00
25      0 days 08:06:00
26      0 days 10:19:00
27      2 days 06:37:00
28      0 days 00:23:00
29      0 days 06:07:38
             ...       
5821    2 days 17:15:49
5822    1 days 04:45:30
5823    1 days 04:44:21
5824    1 days 12:59:00
5825    2 days 13:35:00
5826    1 days 04:34:23
5827    1 days 16:04:26
5828   38 days 12:04:00
5829    4 days 11:02:00
5830   19 days 11:30:57
5831    0 days 21:04:18
5832    5 days 16:30:44
5833   15 days 11:14:55
5834   10 days 10:27:48
5835    1 days 05:13:55
5836    1 days 14:36:53
5837    1 days 12:28:00
5838    2 days 00:43:43
5839    9 days 10:42:25
5840    1 days 22:44:00
5841    0 days 04:21:00
5842    0 days 17:31:11
5843    0 days 20:04:58
5844    1 days 07:15:53
5845    0 days 21:33:40
5846    1 days 23:13:00
5847                NaT
5848    5 days 22:37:00
5849    1 days 10:13:27
5850    3 days 15:19:00
dtype: timedelta64[ns]

In [41]:
selection['time_open'] = selection['closed_date'] - selection['created_date']


/Users/gcg/.virtualenvs/ddj/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [43]:
selection


Out[43]:
unique_key created_date closed_date time_open
0 32570549 2016-02-01 00:12:00 2016-02-01 01:15:00 0 days 01:03:00
1 32572958 2016-02-01 00:14:00 2016-02-01 00:54:00 0 days 00:40:00
2 32573576 2016-02-01 00:15:00 2016-02-01 01:30:00 0 days 01:15:00
3 32572389 2016-02-01 00:17:00 2016-02-01 02:35:00 0 days 02:18:00
4 32570548 2016-02-01 00:19:00 2016-02-01 01:00:00 0 days 00:41:00
5 32574044 2016-02-01 00:21:00 2016-02-01 02:00:00 0 days 01:39:00
6 32573047 2016-02-01 00:22:00 2016-02-01 01:30:00 0 days 01:08:00
7 32572971 2016-02-01 00:27:00 2016-02-01 01:05:00 0 days 00:38:00
8 32575571 2016-02-01 00:32:00 2016-02-01 00:42:00 0 days 00:10:00
9 32576008 2016-02-01 00:32:16 2016-02-01 22:30:00 0 days 21:57:44
10 32573945 2016-02-01 00:40:25 2016-02-01 20:44:42 0 days 20:04:17
11 32570682 2016-02-01 00:44:00 2016-02-08 12:29:00 7 days 11:45:00
12 32572906 2016-02-01 00:50:00 2016-02-10 09:10:00 9 days 08:20:00
13 32579102 2016-02-01 00:53:00 2016-02-01 01:13:00 0 days 00:20:00
14 32578446 2016-02-01 14:26:00 NaT NaT
15 32573686 2016-02-01 00:57:00 2016-02-01 13:45:00 0 days 12:48:00
16 32570317 2016-02-01 01:18:49 2016-02-03 16:59:00 2 days 15:40:11
17 32581124 2016-02-01 01:20:00 2016-02-05 12:04:00 4 days 10:44:00
18 32579277 2016-02-01 01:36:00 2016-02-04 10:37:00 3 days 09:01:00
19 32580720 2016-02-01 01:36:00 2016-02-02 13:09:00 1 days 11:33:00
20 32571895 2016-02-01 01:51:36 2016-02-01 10:23:00 0 days 08:31:24
21 32580285 2016-02-01 02:07:00 2016-02-08 15:25:00 7 days 13:18:00
22 32575419 2016-02-01 02:32:08 2016-02-04 12:06:14 3 days 09:34:06
23 32570671 2016-02-01 02:43:01 2016-02-01 09:40:00 0 days 06:56:59
24 32581170 2016-02-01 03:21:00 2016-02-02 14:57:00 1 days 11:36:00
25 32581434 2016-02-01 03:24:00 2016-02-01 11:30:00 0 days 08:06:00
26 32575447 2016-02-01 03:25:00 2016-02-01 13:44:00 0 days 10:19:00
27 32577440 2016-02-01 03:26:00 2016-02-03 10:03:00 2 days 06:37:00
28 32577560 2016-02-01 03:37:00 2016-02-01 04:00:00 0 days 00:23:00
29 32575033 2016-02-01 03:49:22 2016-02-01 09:57:00 0 days 06:07:38
... ... ... ... ...
5821 32614085 2016-02-06 22:07:24 2016-02-09 15:23:13 2 days 17:15:49
5822 32612339 2016-02-06 22:23:07 2016-02-08 03:08:37 1 days 04:45:30
5823 32612333 2016-02-06 22:25:28 2016-02-08 03:09:49 1 days 04:44:21
5824 32614717 2016-02-06 22:27:00 2016-02-08 11:26:00 1 days 12:59:00
5825 32617685 2016-02-06 22:32:00 2016-02-09 12:07:00 2 days 13:35:00
5826 32615195 2016-02-06 22:34:14 2016-02-08 03:08:37 1 days 04:34:23
5827 32613947 2016-02-06 22:34:34 2016-02-08 14:39:00 1 days 16:04:26
5828 32612566 2016-02-06 22:35:00 2016-03-16 10:39:00 38 days 12:04:00
5829 32614725 2016-02-06 22:41:00 2016-02-11 09:43:00 4 days 11:02:00
5830 32617705 2016-02-06 22:42:29 2016-02-26 10:13:26 19 days 11:30:57
5831 32620216 2016-02-06 23:35:42 2016-02-07 20:40:00 0 days 21:04:18
5832 32617828 2016-02-06 22:47:30 2016-02-12 15:18:14 5 days 16:30:44
5833 32618570 2016-02-06 22:49:43 2016-02-22 10:04:38 15 days 11:14:55
5834 32616052 2016-02-06 22:53:35 2016-02-17 09:21:23 10 days 10:27:48
5835 32612279 2016-02-06 22:55:05 2016-02-08 04:09:00 1 days 05:13:55
5836 32614241 2016-02-06 22:56:01 2016-02-08 13:32:54 1 days 14:36:53
5837 32617383 2016-02-06 22:59:00 2016-02-08 11:27:00 1 days 12:28:00
5838 32618573 2016-02-06 23:02:17 2016-02-08 23:46:00 2 days 00:43:43
5839 32617669 2016-02-06 23:02:35 2016-02-16 09:45:00 9 days 10:42:25
5840 32612533 2016-02-06 23:06:00 2016-02-08 21:50:00 1 days 22:44:00
5841 32618378 2016-02-06 23:09:00 2016-02-07 03:30:00 0 days 04:21:00
5842 32622707 2016-02-06 23:33:49 2016-02-07 17:05:00 0 days 17:31:11
5843 32622721 2016-02-06 23:35:02 2016-02-07 19:40:00 0 days 20:04:58
5844 32613217 2016-02-06 23:36:15 2016-02-08 06:52:08 1 days 07:15:53
5845 32623981 2016-02-06 23:36:20 2016-02-07 21:10:00 0 days 21:33:40
5846 32617301 2016-02-06 23:37:00 2016-02-08 22:50:00 1 days 23:13:00
5847 32614083 2016-02-06 23:46:22 NaT NaT
5848 32612807 2016-02-06 23:53:00 2016-02-12 22:30:00 5 days 22:37:00
5849 32617735 2016-02-06 23:54:02 2016-02-08 10:07:29 1 days 10:13:27
5850 32619242 2016-02-07 00:00:00 2016-02-10 15:19:00 3 days 15:19:00

5851 rows × 4 columns


In [44]:
selection['time_open'].mean()


Out[44]:
Timedelta('6 days 15:05:05.589380')

In [45]:
selection['time_open'].median()


Out[45]:
Timedelta('0 days 21:48:15')

In [47]:
range_time_open = max(selection['time_open']) - min(selection['time_open'])

In [48]:
range_time_open


Out[48]:
Timedelta('108 days 09:25:00')

In [49]:
selection['time_open'].std()


Out[49]:
Timedelta('14 days 12:05:38.260805')

In [51]:
selection['time_open'].quantile(q=0.25)


Out[51]:
Timedelta('0 days 01:14:00')

In [52]:
selection['time_open'].quantile(q=0.5)


Out[52]:
Timedelta('0 days 21:48:15')

In [53]:
selection['time_open'].quantile(q=0.75)


Out[53]:
Timedelta('4 days 00:30:48.500000')

In [55]:
iqr = selection['time_open'].quantile(q=0.75) - selection['time_open'].quantile(q=0.25)
iqr


Out[55]:
Timedelta('3 days 23:16:48.500000')

In [57]:
#check for outliers

In [59]:
ual = selection['time_open'].quantile(q=0.75) + (iqr * 1.5)
ual


Out[59]:
Timedelta('9 days 23:26:01.250000')

In [60]:
lal = selection['time_open'].quantile(q=0.25) - (iqr * 1.5)
lal


Out[60]:
Timedelta('-6 days +02:18:47.250000')

In [ ]: