In [1]:
import pandas as pd
import pg8000
In [2]:
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')
In [3]:
cursor = conn.cursor()
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
column_name = []
for row in cursor.fetchall():
column_name.append(row[0])
print(column_name)
In [4]:
cursor = conn.cursor()
cursor.execute("SELECT unique_key,complaint,created_date,closed_date from dot_311")
output = []
for row in cursor.fetchall():
output.append(row)
In [5]:
df = pd.DataFrame(output)
df.columns = ["unique_key","complaint","created_date","closed_date"]
In [6]:
df['time'] = df['closed_date'] - df['created_date']
In [7]:
df.head()
Out[7]:
In [8]:
df['time'].describe()
Out[8]:
In [9]:
# clean the dirty data
def to_days(x):
try:
return x.days
except:
return 0
df['open_time'] = df['time'].apply(to_days)
In [10]:
df2 = df[df['open_time']>0]
In [11]:
df2['open_time'].describe()
#mean 14.333333
#standard deviation 18.007218
#min 1.000000
#25% 1.000000
#median 4.000000
#75% 28.000000
#max 89.000000
Out[11]:
In [12]:
#range
df2['open_time'].max() - df2['open_time'].min()
Out[12]:
In [13]:
#iqr
iqr = df2['open_time'].quantile(q=0.75)- df2['open_time'].quantile(q=0.25)
iqr
Out[13]:
In [14]:
#ual
ual = df2['open_time'].quantile(q=0.75) + (iqr * 1.5)
ual
Out[14]:
In [15]:
#lal
df2['open_time'].quantile(q=0.25) - (iqr*1.5)
# then open days can't be negative number
Out[15]:
In [16]:
#check the outlier
outlier = df[df['open_time']>ual]
In [17]:
outlier.describe()
Out[17]:
In [ ]: