In [1]:
import pandas as pd
import pg8000

In [2]:
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')

In [3]:
cursor = conn.cursor()
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
column_name = []
for row in cursor.fetchall():
    column_name.append(row[0])
print(column_name)


['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor', 'location t', 'incident z', 'incident a', 'street nam', 'cross stre', 'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city', 'landmark', 'facility t', 'status', 'due date', 'resolution', 'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina', 'park facil', 'park borou', 'school nam', 'school num', 'school reg', 'school cod', 'school pho', 'school add', 'school cit', 'school sta', 'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa', 'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2', 'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude', 'location', 'geom', 'created_date', 'closed_date']
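
In [ ]:
# Side note (a sketch, not part of the original run): the same
# information_schema query can also return each column's data type, which
# is a quick way to check that created_date and closed_date are stored as
# timestamps before doing date arithmetic on them.
cursor = conn.cursor()
cursor.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = 'dot_311'
""")
for name, dtype in cursor.fetchall():
    print(name, dtype)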

In [4]:
cursor = conn.cursor()
cursor.execute("SELECT unique_key,complaint,created_date,closed_date from dot_311")
output = []
for row in cursor.fetchall():
    output.append(row)

In [5]:
df = pd.DataFrame(output)
df.columns = ["unique_key","complaint","created_date","closed_date"]
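
In [ ]:
# Alternative sketch: pandas can build the DataFrame straight from the
# query. This assumes SQLAlchemy is installed (its postgresql+pg8000
# dialect wraps the same driver); otherwise the cursor loop above is fine.
from sqlalchemy import create_engine
engine = create_engine(
    "postgresql+pg8000://dot_student:qgis@"
    "training.c1erymiua9dx.us-east-1.rds.amazonaws.com:5432/training")
df_alt = pd.read_sql_query(
    "SELECT unique_key, complaint, created_date, closed_date FROM dot_311",
    engine)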

In [6]:
df['time'] = df['closed_date'] - df['created_date']
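
In [ ]:
# Sketch: the new 'time' column holds Timedeltas; it can also be expressed
# numerically (e.g. in hours) via the .dt accessor, which makes plotting and
# arithmetic easier. 'hours_open' is just an illustrative column name.
df['hours_open'] = df['time'].dt.total_seconds() / 3600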

In [7]:
df.head()


Out[7]:
   unique_key                 complaint        created_date         closed_date     time
0    32570549  Traffic Signal Condition 2016-02-01 00:12:00 2016-02-01 01:15:00 01:03:00
1    32572958  Traffic Signal Condition 2016-02-01 00:14:00 2016-02-01 00:54:00 00:40:00
2    32573576  Traffic Signal Condition 2016-02-01 00:15:00 2016-02-01 01:30:00 01:15:00
3    32572389  Traffic Signal Condition 2016-02-01 00:17:00 2016-02-01 02:35:00 02:18:00
4    32570548  Traffic Signal Condition 2016-02-01 00:19:00 2016-02-01 01:00:00 00:41:00

In [8]:
df['time'].describe()


Out[8]:
count                       5650
mean      6 days 15:05:05.589380
std      14 days 12:05:38.260805
min           -19 days +09:29:00
25%              0 days 01:14:00
50%              0 days 21:48:15
75%       4 days 00:30:48.500000
max             89 days 18:54:00
Name: time, dtype: object
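
In [ ]:
# Sketch: the negative minimum above means some closed_date values come
# before created_date (and some rows may be missing a date), which is what
# the cleaning step in the next cell guards against. A quick count of each:
print((df['time'] < pd.Timedelta(0)).sum())   # negative durations
print(df['time'].isnull().sum())              # missing durations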

In [9]:
# clean the dirty data: convert each timedelta to a whole number of days,
# treating missing or invalid values as 0
def to_days(x):
    try:
        return x.days           # whole-day part of the Timedelta
    except Exception:           # NaT or other bad values fall back to 0
        return 0
df['open_time'] = df['time'].apply(to_days)
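
In [ ]:
# Sketch of a vectorized alternative to to_days(): .dt.days extracts the
# whole-day component and fillna(0) covers rows where the subtraction gave
# NaT. 'open_time_alt' is just an illustrative name for comparison.
df['open_time_alt'] = df['time'].dt.days.fillna(0).astype(int)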

In [10]:
df2 = df[df['open_time']>0]

In [11]:
df2['open_time'].describe()
#mean                      14.333333
#standard deviation        18.007218
#min                       1.000000
#25%                       1.000000
#median                    4.000000
#75%                       28.000000
#max                       89.000000


Out[11]:
count    2607.000000
mean       14.333333
std        18.007218
min         1.000000
25%         1.000000
50%         4.000000
75%        28.000000
max        89.000000
Name: open_time, dtype: float64

In [12]:
#range
df2['open_time'].max() - df2['open_time'].min()


Out[12]:
88

In [13]:
# iqr: interquartile range (Q3 - Q1)
iqr = df2['open_time'].quantile(q=0.75)- df2['open_time'].quantile(q=0.25)
iqr


Out[13]:
27.0
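
In [ ]:
# Sketch: if SciPy is available, its iqr() helper gives the same value as
# the manual quantile difference above and can serve as a cross-check.
from scipy.stats import iqr as scipy_iqr
scipy_iqr(df2['open_time'])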

In [14]:
# ual: upper adjacent limit (upper fence), Q3 + 1.5 * IQR
ual = df2['open_time'].quantile(q=0.75) + (iqr * 1.5)
ual


Out[14]:
68.5

In [15]:
# lal: lower adjacent limit (lower fence), Q1 - 1.5 * IQR
df2['open_time'].quantile(q=0.25) - (iqr*1.5)
# the lower fence is negative, but open_time can't be negative, so there are no low outliers


Out[15]:
-39.5
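
In [ ]:
# Sketch: the two fences can be wrapped in a small helper so the 1.5 * IQR
# rule is reusable for other columns; tukey_fences is a hypothetical name.
def tukey_fences(s, k=1.5):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    spread = q3 - q1
    return q1 - k * spread, q3 + k * spread

lal_check, ual_check = tukey_fences(df2['open_time'])
print(lal_check, ual_check)   # should reproduce -39.5 and 68.5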

In [16]:
# check the outliers: requests open longer than the upper adjacent limit
outlier = df[df['open_time']>ual]

In [17]:
outlier.describe()


Out[17]:
                            time  open_time
count                         33  33.000000
mean     78 days 14:22:40.303030  78.030303
std       6 days 14:42:00.990269   6.659039
min             69 days 00:41:00  69.000000
25%             71 days 23:58:00  71.000000
50%             78 days 18:49:00  78.000000
75%             84 days 03:08:00  84.000000
max             89 days 18:54:00  89.000000
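
In [ ]:
# Sketch: a box plot makes the fences and the handful of long-running
# requests visible at a glance. Assumes matplotlib is installed and inline
# plotting is enabled in the notebook.
import matplotlib.pyplot as plt
df2['open_time'].plot(kind='box')
plt.ylabel('days open')
plt.show()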

In [ ]: