In [1]:
import pandas as pd
import pg8000

In [2]:
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')

In [3]:
cursor = conn.cursor()
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
column_name = []
for row in cursor.fetchall():
    column_name.append(row[0])
print(column_name)


['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor', 'location t', 'incident z', 'incident a', 'street nam', 'cross stre', 'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city', 'landmark', 'facility t', 'status', 'due date', 'resolution', 'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina', 'park facil', 'park borou', 'school nam', 'school num', 'school reg', 'school cod', 'school pho', 'school add', 'school cit', 'school sta', 'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa', 'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2', 'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude', 'location', 'geom', 'created_date', 'closed_date']
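
In [ ]:
# Side note (a sketch, not part of the original run): the same
# information_schema query can also return each column's data type, which
# is a quick way to check that created_date and closed_date are stored as
# timestamps before doing date arithmetic on them.
cursor = conn.cursor()
cursor.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = 'dot_311'
""")
for name, dtype in cursor.fetchall():
    print(name, dtype)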

In [4]:
cursor = conn.cursor()
cursor.execute("SELECT unique_key,complaint,created_date,closed_date from dot_311")
output = []
for row in cursor.fetchall():
    output.append(row)

In [5]:
df = pd.DataFrame(output)
df.columns = ["unique_key","complaint","created_date","closed_date"]
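
In [ ]:
# Alternative sketch: pandas can build the DataFrame straight from the
# query. This assumes SQLAlchemy is installed (its postgresql+pg8000
# dialect wraps the same driver); otherwise the cursor loop above is fine.
from sqlalchemy import create_engine
engine = create_engine(
    "postgresql+pg8000://dot_student:qgis@"
    "training.c1erymiua9dx.us-east-1.rds.amazonaws.com:5432/training")
df_alt = pd.read_sql_query(
    "SELECT unique_key, complaint, created_date, closed_date FROM dot_311",
    engine)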

In [6]:
df['time'] = df['closed_date'] - df['created_date']
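
In [ ]:
# Sketch: the new 'time' column holds Timedeltas; it can also be expressed
# numerically (e.g. in hours) via the .dt accessor, which makes plotting and
# arithmetic easier. 'hours_open' is just an illustrative column name.
df['hours_open'] = df['time'].dt.total_seconds() / 3600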

In [7]:
df.head()


Out[7]:
   unique_key                 complaint        created_date         closed_date     time
0    32570549  Traffic Signal Condition 2016-02-01 00:12:00 2016-02-01 01:15:00 01:03:00
1    32572958  Traffic Signal Condition 2016-02-01 00:14:00 2016-02-01 00:54:00 00:40:00
2    32573576  Traffic Signal Condition 2016-02-01 00:15:00 2016-02-01 01:30:00 01:15:00
3    32572389  Traffic Signal Condition 2016-02-01 00:17:00 2016-02-01 02:35:00 02:18:00
4    32570548  Traffic Signal Condition 2016-02-01 00:19:00 2016-02-01 01:00:00 00:41:00

In [8]:
df['time'].describe()


Out[8]:
count                       5650
mean      6 days 15:05:05.589380
std      14 days 12:05:38.260805
min           -19 days +09:29:00
25%              0 days 01:14:00
50%              0 days 21:48:15
75%       4 days 00:30:48.500000
max             89 days 18:54:00
Name: time, dtype: object
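
In [ ]:
# Sketch: the negative minimum above means some closed_date values come
# before created_date (and some rows may be missing a date), which is what
# the cleaning step in the next cell guards against. A quick count of each:
print((df['time'] < pd.Timedelta(0)).sum())   # negative durations
print(df['time'].isnull().sum())              # missing durations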

In [9]:
# clean the dirty data: convert each timedelta to a whole number of days,
# treating missing or invalid values as 0
def to_days(x):
    try:
        return x.days           # whole-day part of the Timedelta
    except Exception:           # NaT or other bad values fall back to 0
        return 0
df['open_time'] = df['time'].apply(to_days)
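
In [ ]:
# Sketch of a vectorized alternative to to_days(): .dt.days extracts the
# whole-day component and fillna(0) covers rows where the subtraction gave
# NaT. 'open_time_alt' is just an illustrative name for comparison.
df['open_time_alt'] = df['time'].dt.days.fillna(0).astype(int)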

In [10]:
df2 = df[df['open_time']>0]

In [11]:
df2['open_time'].describe()
#mean                      14.333333
#standard deviation        18.007218
#min                       1.000000
#25%                       1.000000
#median                    4.000000
#75%                       28.000000
#max                       89.000000


Out[11]:
count    2607.000000
mean       14.333333
std        18.007218
min         1.000000
25%         1.000000
50%         4.000000
75%        28.000000
max        89.000000
Name: open_time, dtype: float64

In [12]:
#range
df2['open_time'].max() - df2['open_time'].min()


Out[12]:
88

In [13]:
# iqr: interquartile range (Q3 - Q1)
iqr = df2['open_time'].quantile(q=0.75)- df2['open_time'].quantile(q=0.25)
iqr


Out[13]:
27.0
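
In [ ]:
# Sketch: if SciPy is available, its iqr() helper gives the same value as
# the manual quantile difference above and can serve as a cross-check.
from scipy.stats import iqr as scipy_iqr
scipy_iqr(df2['open_time'])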

In [14]:
# ual: upper adjacent limit (upper fence), Q3 + 1.5 * IQR
ual = df2['open_time'].quantile(q=0.75) + (iqr * 1.5)
ual


Out[14]:
68.5

In [15]:
# lal: lower adjacent limit (lower fence), Q1 - 1.5 * IQR
df2['open_time'].quantile(q=0.25) - (iqr*1.5)
# the lower fence is negative, but open_time can't be negative, so there are no low outliers


Out[15]:
-39.5
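
In [ ]:
# Sketch: the two fences can be wrapped in a small helper so the 1.5 * IQR
# rule is reusable for other columns; tukey_fences is a hypothetical name.
def tukey_fences(s, k=1.5):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    spread = q3 - q1
    return q1 - k * spread, q3 + k * spread

lal_check, ual_check = tukey_fences(df2['open_time'])
print(lal_check, ual_check)   # should reproduce -39.5 and 68.5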

In [16]:
# check the outliers: requests open longer than the upper adjacent limit
outlier = df[df['open_time']>ual]

In [17]:
outlier.describe()


Out[17]:
                            time  open_time
count                         33  33.000000
mean     78 days 14:22:40.303030  78.030303
std       6 days 14:42:00.990269   6.659039
min             69 days 00:41:00  69.000000
25%             71 days 23:58:00  71.000000
50%             78 days 18:49:00  78.000000
75%             84 days 03:08:00  84.000000
max             89 days 18:54:00  89.000000
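
In [ ]:
# Sketch: a box plot makes the fences and the handful of long-running
# requests visible at a glance. Assumes matplotlib is installed and inline
# plotting is enabled in the notebook.
import matplotlib.pyplot as plt
df2['open_time'].plot(kind='box')
plt.ylabel('days open')
plt.show()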

In [ ]: