Assignment #1: Perform a basic statistical analysis of the time DOT 311 (table is called dot_311) complaints are open (subtract closed date from created date). Connect to the database to get the data and do the analysis.
In [492]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
from datetime import timedelta
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')
In [495]:
cursor = conn.cursor()
statement = "SELECT * FROM information_schema.columns WHERE table_name= 'dot_311'"
cursor.execute(statement)
column_names = []
for row in cursor.fetchall():
column_names.append(row[3])
print(column_names)
In [239]:
statement = "SELECT complaint, agency, created_date, closed_date FROM dot_311"
cursor.execute(statement)
complaint_data = []
for row in cursor:
complaint_data.append(row)
In [352]:
df = pd.DataFrame(complaint_data)
df.columns = ['complaint', 'agency', 'created_date', 'closed_date']
df.head(3)
Out[352]:
In [353]:
pd.value_counts(df['complaint'].values, sort=True)
Out[353]:
The most frequent complaints were for street light, street, and traffic signal conditions.
In [465]:
date1 = df['closed_date']
date2 = df['created_date']
timeopen = date1 - date2
df1 = pd.DataFrame(timeopen)
df1.columns = ['Time Taken']
df1.describe()
Out[465]:
In [375]:
print("The smallest time taken to respond to a noise complaint was, miraculously: " + str(c.min()))
print("The longest amount of time was: " + str(df1['Time Taken'].max()))
print("The average time was: " + str(df1['Time Taken'].mean()))
In [496]:
df2 = pd.concat([df, df1], axis=1)
df2.columns = ['Complaint', 'Agency', 'Created Date', 'Closed Date', 'Time Taken']
df3 = bdf[pd.notnull(df2['Time Taken'])]
df3.sort_values(by='Time Taken', ascending=False).head(3)
Out[496]:
In [471]:
timeopen.astype('timedelta64[h]')
a = (timeopen / pd.Timedelta(hours=1)).dropna()
plottable_dates = pd.DataFrame(a)
plottable_dates.describe()
Out[471]:
In [489]:
fig = plt.figure()
ax = fig.add_subplot(1,1,1,)
n, bins, patches = ax.hist(plottable_dates, bins=50, range=(-450, 2155), histtype='bar')
for patch in patches:
patch.set_facecolor('darkgrey')
plt.title('Time Taken to Respond to 311 Requests')
plt.xlabel('Time (in hours)')
Out[489]: