In [2]:
import os

import pg8000
from pandas import DataFrame

# Connect to the class training database (read-only student account).
# Credentials are read from environment variables when present, falling back
# to the original course defaults, so secrets are not hardcoded when this
# notebook is reused elsewhere.
conn = pg8000.connect(
    host=os.environ.get("DOT_DB_HOST", "training.c1erymiua9dx.us-east-1.rds.amazonaws.com"),
    port=int(os.environ.get("DOT_DB_PORT", "5432")),
    database=os.environ.get("DOT_DB_NAME", "training"),
    user=os.environ.get("DOT_DB_USER", "dot_student"),
    password=os.environ.get("DOT_DB_PASSWORD", "qgis"),
)

In [3]:
# Reset the connection's transaction state (clears any earlier aborted
# transaction so the queries below can run).
conn.rollback()

In [4]:
# Open a cursor for executing raw SQL against the connection.
cursor = conn.cursor()

In [5]:
database=cursor.execute("SELECT * FROM dot_311")
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_sql("SELECT * FROM dot_311", conn)

In [6]:
# Preview the first five rows to inspect the raw column names and values.
df.head()


Out[6]:
b'gid' b'unique_key' b'agency' b'agency nam' b'complaint' b'descriptor' b'location t' b'incident z' b'incident a' b'street nam' ... b'bridge h_2' b'garage lot' b'ferry dire' b'ferry term' b'latitude' b'longitude' b'location' b'geom' b'created_date' b'closed_date'
0 2 32570549.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.613804 -74.113374 (40.61380375315792, -74.1133737388642) 0101000020E61000006829ED83418752C0D7FC121F914E... 2016-02-01 00:12:00 2016-02-01 01:15:00
1 3 32572958.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.626545 -74.091582 (40.62654529746493, -74.09158233448058) 0101000020E610000011DF267CDC8552C08809E5A23250... 2016-02-01 00:14:00 2016-02-01 00:54:00
2 4 32573576.0 DOT Department of Transportation Traffic Signal Condition Controller None 10310.0 None None ... None None None None 40.629250 -74.123221 (40.62924955478749, -74.1232207526046) 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:15:00 2016-02-01 01:30:00
3 5 32572389.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:17:00 2016-02-01 02:35:00
4 6 32570548.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:19:00 2016-02-01 01:00:00

5 rows × 55 columns


In [7]:
# The column names come back from the database as raw bytes (e.g. b'gid').
# Decode them to plain strings directly instead of the previous two-step
# hack (taking str(x)[1:] of the bytes repr "b'gid'" and then stripping the
# quote characters), which was fragile — e.g. it would corrupt any name
# containing an apostrophe.
df.columns = [c.decode() if isinstance(c, bytes) else str(c) for c in df.columns]

In [11]:
# Confirm the cleaned (decoded) column names.
df.columns


Out[11]:
Index(['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor',
       'location t', 'incident z', 'incident a', 'street nam', 'cross stre',
       'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city',
       'landmark', 'facility t', 'status', 'due date', 'resolution',
       'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina',
       'park facil', 'park borou', 'school nam', 'school num', 'school reg',
       'school cod', 'school pho', 'school add', 'school cit', 'school sta',
       'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa',
       'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2',
       'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude',
       'location', 'geom', 'created_date', 'closed_date', 'duration'],
      dtype='object')

In [12]:
# Time taken to resolve each complaint: closed timestamp minus created
# timestamp, yielding a timedelta64 column.
df['duration'] = df['closed_date'].sub(df['created_date'])

In [13]:
# Spot-check the first few computed durations.
df['duration'].head()


Out[13]:
0   01:03:00
1   00:40:00
2   01:15:00
3   02:18:00
4   00:41:00
Name: duration, dtype: timedelta64[ns]

In [14]:
# Average (mean) time taken to deal with a complaint.
# NOTE: the mean is heavily skewed by extreme outliers — see the max/min
# checks below.
df['duration'].mean()


Out[14]:
Timedelta('6 days 15:05:05.589380')

But we notice that there are discrepancies in the data. For example:


In [15]:
# Longest recorded resolution time.
df['duration'].max()


Out[15]:
Timedelta('89 days 18:54:00')

In [16]:
# Shortest recorded resolution time — negative, which indicates data-entry
# errors (closed_date earlier than created_date).
df['duration'].min()


Out[16]:
Timedelta('-19 days +09:29:00')

There are complaints with negative durations — their closed date precedes their created date, which must be a data-entry error. So it is essential we look at the median, a statistic that is robust to outliers like these.


In [ ]:


In [17]:
# Median resolution time — robust to the outliers seen above.
df['duration'].median()


Out[17]:
Timedelta('0 days 21:48:15')

So, going by the median, the DOT takes a little under a day to deal with a typical complaint.


In [18]:
# Most common resolution time.
df['duration'].mode()


Out[18]:
0   0 days
dtype: timedelta64[ns]

The range of the duration column (max minus min) is:


In [19]:
# Range of the duration column (max minus min) — inflated by the erroneous
# negative minimum.
df['duration'].max()-df['duration'].min()


Out[19]:
Timedelta('108 days 09:25:00')

Now the quartiles of the data.


In [20]:
# First quartile (25th percentile) of resolution time.
df['duration'].quantile(q=0.25)


Out[20]:
Timedelta('0 days 01:14:00')

In [21]:
# Second quartile (50th percentile) — identical to the median above.
df['duration'].quantile(q=0.50)


Out[21]:
Timedelta('0 days 21:48:15')

In [22]:
# Third quartile (75th percentile) of resolution time.
df['duration'].quantile(q=0.75)


Out[22]:
Timedelta('4 days 00:30:48.500000')

In [23]:
# Standard deviation of resolution time — like the mean, inflated by
# outliers.
df['duration'].std()


Out[23]:
Timedelta('14 days 12:05:38.260805')

In [26]:
# Interquartile range (IQR): Q3 - Q1, the spread of the middle 50% of cases.
df['duration'].quantile(q=0.75) - df['duration'].quantile(q=0.25) #IQR


Out[26]:
Timedelta('3 days 23:16:48.500000')

In [29]:
# Tukey fence distance: 1.5 × IQR. Points more than this distance below Q1
# or above Q3 are conventionally treated as outliers.
1.5 * (df['duration'].quantile(q=0.75) - df['duration'].quantile(q=0.25))


Out[29]:
Timedelta('5 days 22:55:12.750000')

So, by Tukey's rule, any complaint whose duration is more than five days and 22 hours (1.5 × IQR) below the first quartile or above the third quartile is considered an outlier.


In [ ]:


In [ ]: