Perform a basic statistical analysis of the time DOT 311 (table is called dot_311) complaints are open (subtract closed date from created date) Connect to the database to get the data and do the analysis. Submit the code through Github and type up your results in your PR
Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com Port: 5432 Database: training Username: dot_student Password: qgis
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
import pg8000
from pandas import DataFrame
In [2]:
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", user='dot_student', password='qgis', database='training')
cursor = conn.cursor()
In [3]:
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
column_list = []
for row in cursor.fetchall():
column_list.append(row[0])
column_list
Out[3]:
In [4]:
response = "select * from dot_311"
In [5]:
cursor.execute("select * from dot_311")
output = []
for row in cursor.fetchall():
output.append(row)
In [6]:
df = pd.DataFrame(data=output, columns=column_list)
In [7]:
df.head(3)
Out[7]:
In [8]:
sel_df = df[[
'unique_key',
'agency',
'agency nam',
'created_date',
'closed_date',
'complaint',
'descriptor',
'status',
'due date',
'resolution',
'city',
'borough'
]]
In [9]:
sel_df.head(3)
Out[9]:
In [10]:
sel_df.info()
In [78]:
#dateutil.parser.parse("2016-02-01 00:15:00")
In [60]:
sel_df['ClosedDate'] = pd.to_datetime(sel_df['closed_date'], format='%m/%d/%y %H:%M:%S')
sel_df.info()
In [59]:
#no need to do this step as created date and closed date are already in datetime format
sel_df['CreatedDate'] = pd.to_datetime(sel_df['created_date'], format='%m/%d/%y %H:%M:%S')
sel_df.info()
In [15]:
#def parse_date(str_date):
#return dateutil.parser.parse(str_date)
#df['Created Date'] = df['created_date'].apply(parse_date)
In [79]:
sel_df['delay'] = sel_df['ClosedDate'] - sel_df['CreatedDate']
sel_df['delay'].head(3)
Out[79]:
In [64]:
delay.isnull().value_counts()
Out[64]:
In [65]:
delay_delta = pd.DataFrame(sel_df['delay'][sel_df['delay']> pd.Timedelta(seconds=0)].dropna())
delay_delta.describe()
Out[65]:
In [83]:
delay_delta.head()
Out[83]:
In [66]:
#now all the null values have been removed. great ! now i can test with a dataframe.
delay_delta['delay'].isnull().value_counts()
Out[66]:
In [69]:
#changing the delta value to seconds so that we can plot easily
delay_delta['delay'].astype('timedelta64[s]')
Out[69]:
In [90]:
# plotting time delata in days bucket
delay_delta['delay'].astype('timedelta64[D]').hist(bins=20)
plt.xlabel('Response in Days')
plt.ylabel('Frequency of Resonse')
Out[90]:
In [ ]:
delay_delta['delay'].astype('timedelta64[D]').hist(bins=20)
In [21]:
delay_delta['delay'].head()
Out[21]:
In [84]:
# Now checking the original dataframe sel_df. it has 201 null values which i have to remove.
#sel_df['delay'].isnull().value_counts()
In [85]:
# Don't assign a dataframe to a column after dropping rows because if you append to the original dataframe, the index wont match.
#sel_df['delay'] = pd.DataFrame(sel_df['delay'][sel_df['delay']> pd.Timedelta(seconds=0)].dropna())
#sel_df['delay'].describe()
In [86]:
#pd.DataFrame(sel_df['delay'][sel_df['delay']> pd.Timedelta(seconds=0)])['delay']
In [80]:
#sel_df['delay']
In [81]:
#sel_df['delay'].isnull().value_counts()
In [82]:
sel_df.head(3)
Out[82]:
In [87]:
delay_delta['delay'].head()
Out[87]:
In [88]:
delay_delta['delay'].value_counts().head()
Out[88]:
In [27]:
delay_delta['delay'].value_counts().hist()
plt.xlabel('Delay')
plt.ylabel('Frequency of Delay')
Out[27]:
In [89]:
delay_plot = pd.DataFrame(delay_delta['delay'].value_counts())
delay_plot.head()
Out[89]:
In [93]:
delay_delta['delay'].astype('timedelta64[D]').plot(kind='bar',x=delay_plot.index, y ='delay')
plt.xlabel('Response in Days')
plt.ylabel('Frequency of Resonse')
Out[93]:
In [100]:
delay_delta['delay'].describe()
Out[100]:
In [30]:
delay_plot.quantile(q=0.25)
Out[30]:
In [31]:
delay_plot.quantile(q=0.5)
Out[31]:
In [32]:
delay_plot.quantile(q=0.75)
Out[32]:
In [ ]: