In [1]:
import pandas as pd
import pg8000
from datetime import datetime
import numpy as np
import matplotlib
%matplotlib inline
In [2]:
# Set up the database connection
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training', user='dot_student', password='qgis')
In [3]:
# Rollback database connection in case of problems
conn.rollback()
In [4]:
# Getting the data from the database
cursor = conn.cursor()
db = list()
cursor.execute("SELECT * FROM dot_311")
for item in cursor.fetchall():
    db.append(item)
In [5]:
# Getting the headers from the database
cursor.execute("SELECT column_name FROM information_schema.columns WHERE table_name='dot_311'")
header = list()
for item in cursor.fetchall():
    header.append(item[0])
In [6]:
# Creating the Pandas Dataframe
df = pd.DataFrame(data=db)
In [7]:
# Substituting the column names
df.columns = header
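As a possible shortcut, pandas can run the query and pick up the column names in one step. This is only a sketch: pd.read_sql officially documents SQLAlchemy connectables (and sqlite3), so handing it the raw pg8000 connection may work but can emit a warning depending on the pandas version.
# One-step load (assumption: read_sql accepts this DBAPI connection object)
df_alt = pd.read_sql("SELECT * FROM dot_311", conn)
df_alt.head()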
In [8]:
df.head()
Out[8]:
In [9]:
df[['created_date', 'closed_date',]].head(5)
Out[9]:
In [10]:
# Creating a new column with the time elapsed between the creation of a ticket and its closing.
df['time_elapsed'] = df['closed_date'] - df['created_date']
df[['created_date', 'closed_date', 'time_elapsed']].head(5)
Out[10]:
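The subtraction above only yields a timedelta if both columns already have a datetime dtype. As a hedge, in case the driver returned them as plain strings, a defensive conversion would look like this:
# Only needed if created_date / closed_date came back as strings rather than datetimes
df['created_date'] = pd.to_datetime(df['created_date'])
df['closed_date'] = pd.to_datetime(df['closed_date'])
df.dtypes[['created_date', 'closed_date']]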
In [13]:
# Processing a request takes time. Unfortunately, there are negative durations in the dataset. That's not possible in reality...
# Let's check how many of the durations are negative.
df[df['time_elapsed'] < '00:00:00']['unique_key'].count()
Out[13]:
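To get a feel for what these implausible rows look like, here is a quick peek at a few of them (pd.Timedelta(0) is equivalent to the '00:00:00' string comparison used above):
df[df['time_elapsed'] < pd.Timedelta(0)][['created_date', 'closed_date', 'time_elapsed']].head()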
In [12]:
# Let's check how many items are in the dataset in total.
df['unique_key'].count()
Out[12]:
In [219]:
# There are also a lot of tickets that have been opened and closed at the same time.
df[df['time_elapsed'] == '00:00:00']['unique_key'].count()
Out[219]:
In [220]:
# What percentage of the times are wrong?
100 * df[df['time_elapsed'] <= '00:00:00']['unique_key'].count() / df['unique_key'].count()
Out[220]:
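The same percentage can be computed with a boolean mean, which avoids the double count(); the two results only differ if some closed dates are missing, since count() skips NaN:
100 * (df['time_elapsed'] <= pd.Timedelta(0)).mean()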
In [53]:
# Every fifth entry seems to be strange. That is a lot. If the wrong ones are distributed randomly,
# we could go on. If most of them are from the same department or about the same topic,
# we would have to think about it.
print("This are all complaint categories:\n\n", df['complaint'].value_counts())
wrong = df[df['time_elapsed'] <= '00:00:00']
print("This are the categories affected by non plausible values:\n\n", wrong['complaint'].value_counts(), "\n\n")
print("In the category Street Light Condition", 100 * wrong[wrong['complaint'] == 'Street Light Condition']['complaint'].value_counts() / df[df['complaint'] == 'Street Light Condition']['complaint'].value_counts(), "percent are wrong.")
In [56]:
# About half of the entries about "Street Light Condition" are wrong. Let's exclude these from our analysis.
# We also exclude the other implausible data. We need to mention this in our article, though.
df_plausible = df[df['complaint'] != 'Street Light Condition']
df_plausible = df_plausible[df_plausible['time_elapsed'] > '00:00:00'].copy()
df_plausible.head(3)
Out[56]:
In [57]:
# To be able to plot it, we will add a new column with the time converted to minutes.
df_plausible['time_elapsed_minutes'] = df_plausible['time_elapsed'].apply(lambda x: x / np.timedelta64(1,'m'))
df_plausible[['time_elapsed', 'time_elapsed_minutes']].head()
Out[57]:
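An equivalent conversion uses the .dt accessor instead of dividing by a timedelta unit:
df_plausible['time_elapsed'].dt.total_seconds() / 60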
In [80]:
# Let's get an overview by plotting it.
ax = df_plausible['time_elapsed_minutes'].hist(bins=60)
ax.set_title("The city takes little time to work on complaints")
ax.set_xlabel("Time in Minutes")
ax.set_ylabel("Number of Complaints")
Out[80]:
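Because the distribution is so heavily skewed, the long tail is barely visible on a linear count axis. A sketch of the same histogram with a logarithmic y-axis (log=True is passed through to matplotlib's hist):
ax = df_plausible['time_elapsed_minutes'].hist(bins=60, log=True)
ax.set_xlabel("Time in Minutes")
ax.set_ylabel("Number of Complaints (log scale)")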
In [82]:
# How long does it take them to close a complaint? Let's check the mean, the quartiles, the min and the max.
df_plausible['time_elapsed'].describe()
Out[82]:
In [83]:
# Since the distribution is nowhere near a bell-shaped curve, we'd better use the median instead of the mean.
df_plausible['time_elapsed'].median()
Out[83]:
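For readability, the same median can be expressed in hours by dividing by a one-hour timedelta:
df_plausible['time_elapsed'].median() / pd.Timedelta(hours=1)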
In [84]:
# Let's just look at the complaints that were closed in the first hour after their opening.
df_plausible[df_plausible['time_elapsed_minutes'] < 60]['time_elapsed_minutes'].hist(bins=60)
Out[84]:
In [87]:
# Let's check what types of complaints take the longest and the shortest to get closed.
df_plausible.groupby(by='complaint')['time_elapsed_minutes'].median().sort_values()
Out[87]:
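To check whether the ranking is driven by tiny categories, the medians can be shown alongside the number of complaints per category:
df_plausible.groupby(by='complaint')['time_elapsed_minutes'].agg(['median', 'count']).sort_values('median')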
In [96]:
# Let's plot that.
df_plausible.groupby(by='complaint')['time_elapsed_minutes'].median().sort_values(ascending=False).plot(kind='barh', legend=False)
Out[96]:
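Following the labeling pattern used for the histogram above, the bar chart could be labeled like this (the axis texts are suggestions, not from the original):
ax = df_plausible.groupby(by='complaint')['time_elapsed_minutes'].median().sort_values(ascending=False).plot(kind='barh', legend=False)
ax.set_xlabel("Median time in minutes")
ax.set_ylabel("Complaint category")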
In [101]:
df_plausible[df_plausible['complaint'] == 'Traffic Signal Condition']['time_elapsed'].median()
Out[101]:
In [102]:
df_plausible[df_plausible['complaint'] == 'Public Toilet']['time_elapsed'].median()
Out[102]:
In [103]:
df_plausible[df_plausible['complaint'] == 'Broken Parking Meter']['time_elapsed'].median()
Out[103]:
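The three single-category medians above can also be computed in one pass; this is just a compact equivalent of the three cells above:
selected = ['Traffic Signal Condition', 'Public Toilet', 'Broken Parking Meter']
df_plausible[df_plausible['complaint'].isin(selected)].groupby('complaint')['time_elapsed'].median()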