In [2]:
import os

import pg8000
from pandas import DataFrame

# Connect to the class training database (read-only student account).
# Credentials are read from environment variables when present, falling back
# to the original course defaults, so secrets are not hardcoded when this
# notebook is reused elsewhere.
conn = pg8000.connect(
    host=os.environ.get("DOT_DB_HOST", "training.c1erymiua9dx.us-east-1.rds.amazonaws.com"),
    port=int(os.environ.get("DOT_DB_PORT", "5432")),
    database=os.environ.get("DOT_DB_NAME", "training"),
    user=os.environ.get("DOT_DB_USER", "dot_student"),
    password=os.environ.get("DOT_DB_PASSWORD", "qgis"),
)

In [3]:
# Reset the connection's transaction state (clears any earlier aborted
# transaction so the queries below can run).
conn.rollback()

In [4]:
# Open a cursor for executing raw SQL against the connection.
cursor = conn.cursor()

In [5]:
database=cursor.execute("SELECT * FROM dot_311")
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_sql("SELECT * FROM dot_311", conn)

In [6]:
# Preview the first five rows to inspect the raw column names and values.
df.head()


Out[6]:
b'gid' b'unique_key' b'agency' b'agency nam' b'complaint' b'descriptor' b'location t' b'incident z' b'incident a' b'street nam' ... b'bridge h_2' b'garage lot' b'ferry dire' b'ferry term' b'latitude' b'longitude' b'location' b'geom' b'created_date' b'closed_date'
0 2 32570549.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.613804 -74.113374 (40.61380375315792, -74.1133737388642) 0101000020E61000006829ED83418752C0D7FC121F914E... 2016-02-01 00:12:00 2016-02-01 01:15:00
1 3 32572958.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.626545 -74.091582 (40.62654529746493, -74.09158233448058) 0101000020E610000011DF267CDC8552C08809E5A23250... 2016-02-01 00:14:00 2016-02-01 00:54:00
2 4 32573576.0 DOT Department of Transportation Traffic Signal Condition Controller None 10310.0 None None ... None None None None 40.629250 -74.123221 (40.62924955478749, -74.1232207526046) 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:15:00 2016-02-01 01:30:00
3 5 32572389.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:17:00 2016-02-01 02:35:00
4 6 32570548.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:19:00 2016-02-01 01:00:00

5 rows × 55 columns


In [7]:
# The column names come back from the database as raw bytes (e.g. b'gid').
# Decode them to plain strings directly instead of the previous two-step
# hack (taking str(x)[1:] of the bytes repr "b'gid'" and then stripping the
# quote characters), which was fragile — e.g. it would corrupt any name
# containing an apostrophe.
df.columns = [c.decode() if isinstance(c, bytes) else str(c) for c in df.columns]

In [11]:
# Confirm the cleaned (decoded) column names.
df.columns


Out[11]:
Index(['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor',
       'location t', 'incident z', 'incident a', 'street nam', 'cross stre',
       'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city',
       'landmark', 'facility t', 'status', 'due date', 'resolution',
       'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina',
       'park facil', 'park borou', 'school nam', 'school num', 'school reg',
       'school cod', 'school pho', 'school add', 'school cit', 'school sta',
       'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa',
       'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2',
       'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude',
       'location', 'geom', 'created_date', 'closed_date', 'duration'],
      dtype='object')

In [12]:
# Time taken to resolve each complaint: closed timestamp minus created
# timestamp, yielding a timedelta64 column.
df['duration'] = df['closed_date'].sub(df['created_date'])

In [13]:
# Spot-check the first few computed durations.
df['duration'].head()


Out[13]:
0   01:03:00
1   00:40:00
2   01:15:00
3   02:18:00
4   00:41:00
Name: duration, dtype: timedelta64[ns]

In [14]:
# Average (mean) time taken to deal with a complaint.
# NOTE: the mean is heavily skewed by extreme outliers — see the max/min
# checks below.
df['duration'].mean()


Out[14]:
Timedelta('6 days 15:05:05.589380')

But we notice that there are discrepancies in the data. For example:


In [15]:
# Longest recorded resolution time.
df['duration'].max()


Out[15]:
Timedelta('89 days 18:54:00')

In [16]:
# Shortest recorded resolution time — negative, which indicates data-entry
# errors (closed_date earlier than created_date).
df['duration'].min()


Out[16]:
Timedelta('-19 days +09:29:00')

There are complaints with negative durations — their closed date precedes their created date, which must be a data-entry error. So it is essential we look at the median, a statistic that is robust to outliers like these.


In [ ]:


In [17]:
# Median resolution time — robust to the outliers seen above.
df['duration'].median()


Out[17]:
Timedelta('0 days 21:48:15')

So, going by the median, the DOT takes a little under a day to deal with a typical complaint.


In [18]:
# Most common resolution time.
df['duration'].mode()


Out[18]:
0   0 days
dtype: timedelta64[ns]

The range of the duration column (max minus min) is:


In [19]:
# Range of the duration column (max minus min) — inflated by the erroneous
# negative minimum.
df['duration'].max()-df['duration'].min()


Out[19]:
Timedelta('108 days 09:25:00')

Now the quartiles of the data.


In [20]:
# First quartile (25th percentile) of resolution time.
df['duration'].quantile(q=0.25)


Out[20]:
Timedelta('0 days 01:14:00')

In [21]:
# Second quartile (50th percentile) — identical to the median above.
df['duration'].quantile(q=0.50)


Out[21]:
Timedelta('0 days 21:48:15')

In [22]:
# Third quartile (75th percentile) of resolution time.
df['duration'].quantile(q=0.75)


Out[22]:
Timedelta('4 days 00:30:48.500000')

In [23]:
# Standard deviation of resolution time — like the mean, inflated by
# outliers.
df['duration'].std()


Out[23]:
Timedelta('14 days 12:05:38.260805')

In [26]:
# Interquartile range (IQR): Q3 - Q1, the spread of the middle 50% of cases.
df['duration'].quantile(q=0.75) - df['duration'].quantile(q=0.25) #IQR


Out[26]:
Timedelta('3 days 23:16:48.500000')

In [29]:
# Tukey fence distance: 1.5 × IQR. Points more than this distance below Q1
# or above Q3 are conventionally treated as outliers.
1.5 * (df['duration'].quantile(q=0.75) - df['duration'].quantile(q=0.25))


Out[29]:
Timedelta('5 days 22:55:12.750000')

So, by Tukey's rule, any complaint whose duration is more than five days and 22 hours (1.5 × IQR) below the first quartile or above the third quartile is considered an outlier.


In [ ]:


In [ ]: