In [10]:
import pandas as pd
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training', user='dot_student', password='qgis')
In [11]:
cursor = conn.cursor()
In [12]:
cursor.execute("SELECT * FROM dot_311")
In [13]:
dataset = []
for record in cursor.fetchall():
dataset.append(record)
In [14]:
df = pd.DataFrame(dataset)
In [18]:
conn.rollback()
In [16]:
df.head()
Out[16]:
In [24]:
df.columns
Out[24]:
In [28]:
selection = df[[1,53,54]]
selection.columns = ("unique_key", "created_date", "closed_date")
In [30]:
selection.head()
Out[30]:
In [38]:
selection['closed_date'] - selection['created_date']
Out[38]:
In [41]:
selection['time_open'] = selection['closed_date'] - selection['created_date']
In [43]:
selection
Out[43]:
In [44]:
selection['time_open'].mean()
Out[44]:
In [45]:
selection['time_open'].median()
Out[45]:
In [47]:
range_time_open = max(selection['time_open']) - min(selection['time_open'])
In [48]:
range_time_open
Out[48]:
In [49]:
selection['time_open'].std()
Out[49]:
In [51]:
selection['time_open'].quantile(q=0.25)
Out[51]:
In [52]:
selection['time_open'].quantile(q=0.5)
Out[52]:
In [53]:
selection['time_open'].quantile(q=0.75)
Out[53]:
In [55]:
iqr = selection['time_open'].quantile(q=0.75) - selection['time_open'].quantile(q=0.25)
iqr
Out[55]:
In [57]:
#check for outliers
In [59]:
ual = selection['time_open'].quantile(q=0.75) + (iqr * 1.5)
ual
Out[59]:
In [60]:
lal = selection['time_open'].quantile(q=0.25) - (iqr * 1.5)
lal
Out[60]:
In [ ]: