Query the training database you used in Homework 4-1 and query the noise_311 table. Perform a basic statistical analysis of the time a complaint is left open. Compare the results to what you found for DOT complaints. Submit this as Do Now 5.
Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com
Port: 5432
Database: training
Username: dot_student
Password: qgis
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
import pg8000
from pandas import DataFrame
In [19]:
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", user='dot_student', password='qgis', database='training')
cursor = conn.cursor()
In [20]:
cursor.execute("select column_name from information_schema.columns where table_name='noise_311'")
column_list = []
for row in cursor.fetchall():
column_list.append(row[0])
column_list
Out[20]:
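A slightly extended version of the same information_schema query (a sketch, not part of the original run) also returns each column's data type, which is handy later for seeing that the date columns already come back as timestamps:

cursor.execute("""
    select column_name, data_type
    from information_schema.columns
    where table_name = 'noise_311'
""")
dict(cursor.fetchall())  # map column name -> PostgreSQL type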
In [21]:
query = "select * from noise_311"
In [22]:
cursor.execute("select * from noise_311")
output = []
for row in cursor.fetchall():
output.append(row)
In [23]:
df = pd.DataFrame(data=output, columns=column_list)
In [24]:
df.head(3)
Out[24]:
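As an aside, the fetch-and-build steps above can be collapsed into a single call with pandas.read_sql; this is only a sketch, and pandas may warn that it prefers a SQLAlchemy engine over a raw pg8000 connection:

df_alt = pd.read_sql("select * from noise_311", conn)  # columns come back already named
df_alt.head(3)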
In [27]:
sel_df = df[[
'unique_key',
'agency',
'agency_name',
'created_date',
'closed_date',
'complaint_type',
'descriptor',
'status',
'due_date',
'resolution_description',
'city',
'borough'
]].copy()  # copy so later column assignments don't trigger SettingWithCopyWarning
In [28]:
sel_df.head(3)
Out[28]:
In [29]:
sel_df.info()
In [78]:
#dateutil.parser.parse("2016-02-01 00:15:00")
In [30]:
sel_df['ClosedDate'] = pd.to_datetime(sel_df['closed_date'], format='%m/%d/%y %H:%M:%S')
sel_df.info()
In [31]:
# Not strictly necessary, since created_date and closed_date already arrive as datetimes,
# but re-parsing is harmless and keeps the new column names consistent
sel_df['CreatedDate'] = pd.to_datetime(sel_df['created_date'], format='%m/%d/%y %H:%M:%S')
sel_df.info()
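If any of the raw date values were empty or malformed, parsing with an explicit format would raise. A more defensive variant (just a sketch; not needed when the columns already parse cleanly) coerces bad values to NaT and counts them:

sel_df['ClosedDate'] = pd.to_datetime(sel_df['closed_date'], errors='coerce')
sel_df['CreatedDate'] = pd.to_datetime(sel_df['created_date'], errors='coerce')
sel_df[['CreatedDate', 'ClosedDate']].isnull().sum()  # rows that failed to parse become NaT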
In [32]:
sel_df['delay'] = sel_df['ClosedDate'] - sel_df['CreatedDate']
sel_df['delay'].head(3)
Out[32]:
In [34]:
sel_df['delay'].isnull().value_counts()
Out[34]:
In [35]:
# Keep only strictly positive, non-null delays
delay_delta = pd.DataFrame(sel_df['delay'][sel_df['delay'] > pd.Timedelta(seconds=0)].dropna())
delay_delta.describe()
Out[35]:
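For comparing against the DOT complaint statistics it can be easier to read these numbers as plain floats rather than Timedeltas. A small sketch converting the positive delays to hours:

delay_hours = delay_delta['delay'].dt.total_seconds() / 3600.0  # Timedelta -> hours
delay_hours.describe()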
In [36]:
delay_delta.head()
Out[36]:
In [37]:
# All null values have now been removed, so the DataFrame is ready for analysis; confirm below
delay_delta['delay'].isnull().value_counts()
Out[37]:
In [41]:
# Convert the timedelta to minutes so it can be plotted easily
delay_delta['delay'].astype('timedelta64[m]')
Out[41]:
In [43]:
# Plot the delay distribution, bucketed in minutes
delay_delta['delay'].astype('timedelta64[m]').hist(bins=20)
plt.xlabel('Response in Minutes')
plt.ylabel('Frequency of Response')
Out[43]:
In [44]:
# Plot the delay distribution, bucketed in days
delay_delta['delay'].astype('timedelta64[D]').hist(bins=20)
plt.xlabel('Response in Days')
plt.ylabel('Frequency of Response')
Out[44]:
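If most observations land in the first bin (typical for long-tailed closure times), a log-scaled y axis makes the tail easier to see; a minimal sketch reusing the same conversion:

ax = delay_delta['delay'].astype('timedelta64[D]').hist(bins=20)
ax.set_yscale('log')  # log scale keeps the rare, very slow closures visible
plt.xlabel('Response in Days')
plt.ylabel('Frequency of Response (log scale)')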
In [21]:
delay_delta['delay'].head()
Out[21]:
In [46]:
sel_df.head(3)
Out[46]:
In [47]:
delay_delta['delay'].head()
Out[47]:
In [48]:
delay_delta['delay'].value_counts().head()
Out[48]:
In [49]:
# Note: value_counts().hist() plots the distribution of the counts themselves
# (how many complaints share each exact delay value), not the delays
delay_delta['delay'].value_counts().hist()
plt.xlabel('Number of complaints sharing a delay value')
plt.ylabel('Frequency')
Out[49]:
In [51]:
delay_plot = pd.DataFrame(delay_delta['delay'].value_counts())
delay_plot.head()
Out[51]:
In [51]:
delay_delta['delay'].describe()
Out[51]:
In [52]:
delay_delta['delay'].quantile(q=0.25)
Out[52]:
In [53]:
delay_delta['delay'].quantile(q=0.5)
Out[53]:
In [54]:
delay_delta['delay'].quantile(q=0.75)
Out[54]:
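The three quartiles can also be pulled in one call, which keeps the comparison with the DOT numbers compact; a small convenience sketch:

delay_delta['delay'].quantile([0.25, 0.5, 0.75])  # 25th, 50th, 75th percentiles together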
In [ ]: