In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
%matplotlib inline
In [2]:
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')
cursor = conn.cursor()
In [3]:
# Getting the column names
statement = "select * from INFORMATION_SCHEMA.COLUMNS where table_name = 'dot_311'"
cursor.execute(statement)
columns = []
for row in cursor.fetchall():
columns.append(row[3])
print(*columns)
In [4]:
statement = "SELECT unique_key, complaint, created_date, closed_date FROM dot_311"
cursor.execute(statement)
complaints = []
for row in cursor:
complaints.append(row)
In [5]:
for complaint in complaints:
try:
complaint.append(complaint[3] - complaint[2])
except:
complaint.append(None)
In [6]:
df = pd.DataFrame(complaints)
df.columns = ['key', 'complaint', 'created_date', 'closed_date', 'time']
In [7]:
df.describe()
Out[7]:
In [8]:
from datetime import timedelta
from math import nan
no_delta_count = 0
def delta_to_seconds(delta):
if not isinstance(delta, timedelta):
global no_delta_count
no_delta_count = no_delta_count + 1
return nan
else:
return timedelta.total_seconds(delta)
df['time_seconds'] = df['time'].apply(delta_to_seconds)
print(no_delta_count, "values out of", len(df), "weren't time deltas.")
In [9]:
df['time_seconds'].hist(bins=100, figsize=(30, 6))
Out[9]:
In [10]:
df[['time_seconds']].boxplot(return_type='axes')
Out[10]:
In [11]:
# The record time is 89 days
df.iloc[df[['time_seconds']].idxmax()]
Out[11]:
In [12]:
# And one request was recorded as closed 19 days *before* being created
df.iloc[df[['time_seconds']].idxmin()]
Out[12]:
In [13]:
# We'll now study the appropriate times
df_positive_time = df[df['time_seconds'] > 0]
df_positive_time.shape
Out[13]:
In [14]:
df_positive_time.boxplot(return_type='axes')
df_positive_time.hist()
Out[14]:
In [15]:
df_less_11_days = df_positive_time[df_positive_time['time_seconds'] < 1000000]
df_more_11_days = df[df['time_seconds'] > 1000000]
In [16]:
df_more_11_days.boxplot(return_type='axes')
df_more_11_days.hist()
Out[16]:
In [17]:
df_less_11_days.boxplot(return_type='axes')
df_less_11_days.hist()
Out[17]:
In [18]:
df_no_nan = df[df['time_seconds'] == df['time_seconds']]
iqr = df_no_nan['time_seconds'].quantile(q=0.75) - df_no_nan['time_seconds'].quantile(q=0.25)
print("The interquartile is", iqr, "seconds.")
print("Which is:", df['time'].quantile(q=0.75) - df['time'].quantile(q=0.25))
print("The standard deviation is", df['time'].std())
In [19]:
def set_timeframe(time_seconds):
if time_seconds < 0:
return 'Negative'
elif time_seconds == 0:
return 'Zero'
elif time_seconds < 604800:
return 'Less than a week'
elif time_seconds >= 604800:
return 'More than a week'
else:
return 'Undefined'
df['timeframe'] = df['time_seconds'].apply(set_timeframe)
In [20]:
df['timeframe'].value_counts()
Out[20]:
In [21]:
print("Top complaints per timeframe:")
df.groupby('timeframe')['complaint'].value_counts()
Out[21]:
In [23]:
# Let's have a closer look at these four groups
df.groupby('timeframe').boxplot(figsize=(15, 10), return_type='axes')
df.groupby('timeframe').hist()
Out[23]:
In [22]:
# Fun with small multiples!
category = 'timeframe'
categories = df[category].unique().tolist()
nrows = 2; ncols = 2
num_plots = nrows * ncols # number of subplots
fig = plt.figure(figsize=(10, 6))
axes = [plt.subplot(nrows,ncols,i) for i in range(1,num_plots+1)]
plt.tight_layout(pad=0, w_pad=3, h_pad=1)
plt.subplots_adjust(hspace=.5)
for i in range(num_plots):
ax = axes[i]
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
dfx = df[df[category]==categories[i]]#.head(5)
# BAR CHARTS
x = dfx['complaint']
y = dfx['time_seconds']
ax.set_title(categories[i])
ax.bar(left=range(0, len(dfx)), height=dfx['time_seconds'], linewidth=0)