In [2]:
import pg8000
from pandas import DataFrame
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com",port=5432,database="training",user="dot_student",password="qgis")
In [3]:
conn.rollback()
In [4]:
cursor = conn.cursor()
In [5]:
database=cursor.execute("SELECT * FROM dot_311")
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_sql("SELECT * FROM dot_311", conn)
In [6]:
df.head()
Out[6]:
In [7]:
df=df.rename(columns = lambda x : str(x)[1:])
df.columns = [c.replace('\'', '') for c in df.columns]
In [11]:
df.columns
Out[11]:
In [12]:
df['duration']=df['closed_date']-df['created_date']
In [13]:
df['duration'].head()
Out[13]:
In [14]:
## AVERAGE TIME TAKEN TO DEAL WITH A COMPLAINT :
df['duration'].mean()
Out[14]:
But we notice that there are discrepancies in the data. For example:
In [15]:
df['duration'].max()
Out[15]:
In [16]:
df['duration'].min()
Out[16]:
There are complaints that take negative days! So it is essential we see the median, so that outliers like these don't affect our understanding.
In [ ]:
In [17]:
df['duration'].median()
Out[17]:
So on an average, the DOT takes a little under a day to deal with most complaints.
In [18]:
df['duration'].mode()
Out[18]:
The range of the dataframe is:
In [19]:
df['duration'].max()-df['duration'].min()
Out[19]:
Now the quartiles of the data.
In [20]:
df['duration'].quantile(q=0.25)
Out[20]:
In [21]:
df['duration'].quantile(q=0.50)
Out[21]:
In [22]:
df['duration'].quantile(q=0.75)
Out[22]:
In [23]:
df['duration'].std()
Out[23]:
In [26]:
df['duration'].quantile(q=0.75) - df['duration'].quantile(q=0.25) #IQR
Out[26]:
In [29]:
1.5 * (df['duration'].quantile(q=0.75) - df['duration'].quantile(q=0.25))
Out[29]:
So Five Days and 22 Hours Above or Below the IQR,lie the outliers.
In [ ]:
In [ ]: