In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
%matplotlib inline
In [12]:
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')
#cursor = conn.cursor()
In [20]:
df = pd.read_sql("select * from noise_311", conn)
In [21]:
df['time'] = df[b'closed_date'] - df[b'created_date']
In [22]:
df.describe()
Out[22]:
In [23]:
from datetime import timedelta
from math import nan
no_delta_count = 0
def delta_to_seconds(delta):
if not isinstance(delta, timedelta):
global no_delta_count
no_delta_count = no_delta_count + 1
return nan
else:
return timedelta.total_seconds(delta)
df['time_seconds'] = df['time'].apply(delta_to_seconds)
print(no_delta_count, "values out of", len(df), "weren't time deltas.")
In [25]:
df['time_seconds'].hist(bins=30, figsize=(30, 6))
Out[25]:
In [26]:
df[['time_seconds']].boxplot(return_type='axes')
Out[26]:
In [47]:
# The maximum time
print("The maximum time is", df['time'].max())
df.iloc[df[['time']].idxmax()]
Out[47]:
In [48]:
# Minimum time
print("The minimum time is", df['time'].min())
df.iloc[df[['time_seconds']].idxmin()]
Out[48]:
In [54]:
df.boxplot(column='time_seconds', return_type='axes')
df['time_seconds'].hist()
Out[54]:
In [19]:
def set_timeframe(time_seconds):
if time_seconds < 0:
return 'Negative'
elif time_seconds == 0:
return 'Zero'
elif time_seconds < 604800:
return 'Less than a week'
elif time_seconds >= 604800:
return 'More than a week'
else:
return 'Undefined'
df['timeframe'] = df['time_seconds'].apply(set_timeframe)
In [20]:
df['timeframe'].value_counts()
Out[20]:
In [21]:
print("Top complaints per timeframe:")
df.groupby('timeframe')['complaint'].value_counts()
Out[21]:
In [23]:
# Let's have a closer look at these four groups
df.groupby('timeframe').boxplot(figsize=(15, 10), return_type='axes')
df.groupby('timeframe').hist()
Out[23]:
In [22]:
# Fun with small multiples!
category = 'timeframe'
categories = df[category].unique().tolist()
nrows = 2; ncols = 2
num_plots = nrows * ncols # number of subplots
fig = plt.figure(figsize=(10, 6))
axes = [plt.subplot(nrows,ncols,i) for i in range(1,num_plots+1)]
plt.tight_layout(pad=0, w_pad=3, h_pad=1)
plt.subplots_adjust(hspace=.5)
for i in range(num_plots):
ax = axes[i]
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
dfx = df[df[category]==categories[i]]#.head(5)
# BAR CHARTS
x = dfx['complaint']
y = dfx['time_seconds']
ax.set_title(categories[i])
ax.bar(left=range(0, len(dfx)), height=dfx['time_seconds'], linewidth=0)