In [189]:
import pg8000
import datetime
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
In [28]:
conn = pg8000.connect(database= 'training', port = 5432, host = 'training.c1erymiua9dx.us-east-1.rds.amazonaws.com', user = 'dot_student', password = 'qgis')
In [56]:
conn.rollback()
In [36]:
cursor = conn.cursor()
What does this data look like?
In [39]:
cursor.execute("""SELECT *
FROM dot_311
LIMIT 1""")
cursor.fetchone()
Out[39]:
What are the column names?
In [46]:
cursor.execute("""SELECT column_name FROM information_schema.columns WHERE table_name='dot_311'""")
cursor.fetchall()
Out[46]:
What does a created and closed date look like?
In [53]:
cursor.execute("""SELECT created_date, closed_date
FROM dot_311
LIMIT 1""")
cursor.fetchone()
Out[53]:
Okay, let's just save the created and closed dates.
In [57]:
cursor.execute("""SELECT created_date, closed_date
FROM dot_311""")
results = cursor.fetchall()
Exploring how to convert them into the length of time the complaints are open.
In [62]:
results[0][0]
Out[62]:
In [63]:
results[0][1]
Out[63]:
In [64]:
results[0][1] - results[0][0]
Out[64]:
In [71]:
dif = results[0][1] - results[0][0]
divmod(dif.days * 86400 + dif.seconds, 60)
# datetime.datetime.strftime(datetime.timedelta(0, 3780), "%H:%M")
Out[71]:
In [79]:
dif = results[0][1] - results[0][0]
divmod(dif.seconds, 60)
Out[79]:
In [117]:
dif = datetime.timedelta(2, 3780)
In [119]:
dif.days * 1440 + dif.seconds/60
Out[119]:
In [73]:
results[:3]
Out[73]:
In [83]:
timelen = []
for created, closed in results[:3]:
print('created', created)
print('closed', closed)
print('')
Okay, let's make a list of how many minutes each one of these is open!
In [120]:
timelen = []
for created, closed in results:
if created and closed:
dif = closed - created
timelen.append(dif.days * 1440 + dif.seconds/60)
201 or 1.7% of the items didn't have both a closed and created time
In [121]:
len(results) - len(timelen)
Out[121]:
In [122]:
201 / (len(results) + len(timelen))
Out[122]:
And at least 1103 of the results are probably not accurate.
In [123]:
num = 0
for time in timelen:
if time <= 0:
num += 1
print(num)
In [124]:
sorted(timelen)[:20]
Out[124]:
Let's turn this into a Pandas DataFrame.
In [125]:
df = pd.DataFrame({'timelen': timelen})
Okay, let's make a histogram showing how many incidents happen.
In [131]:
df.hist(bins = 28, range = (-120, 1560))
Out[131]:
Looks like those 1103 0-minute cases are all outliers that are going to mess things up.
In [132]:
df.mean()
Out[132]:
In [157]:
df[df['timelen'] >= 0].mean()
Out[157]:
The mean is a LOT higher than the median, which suggests the data is skewed.
In [133]:
df.median()
Out[133]:
In [158]:
df[df['timelen'] >= 0].median()
Out[158]:
In [134]:
df.mode()
Out[134]:
In [159]:
df[df['timelen'] >= 0].mode()
Out[159]:
In [164]:
max(timelen) - min(timelen)
Out[164]:
In [161]:
df[df['timelen'] >= 0].max() - df[df['timelen'] >= 0].min()
Out[161]:
In [163]:
df.std()
Out[163]:
In [162]:
df[df['timelen'] >= 0].std()
Out[162]:
If you think outliers are numbers three standard deviations away from the mean, there are 68 outliers, but it doesn't catch all of the negative values.
In [137]:
df.mean() + df.std() * 3
Out[137]:
In [138]:
df.mean() - df.std() * 3
Out[138]:
In [178]:
std_outliers = df[(df['timelen'] < -53111.819884) | (df['timelen'] > 72202.006197)]
In [179]:
len(std_outliers)
Out[179]:
In [165]:
std_outliers.head()
Out[165]:
In [172]:
df[df['timelen'] >= 0].mean() + df[df['timelen'] >= 0].std() * 3
Out[172]:
In [173]:
df[df['timelen'] >= 0].mean() - df[df['timelen'] >= 0].std() * 3
Out[173]:
In [175]:
std_outliers = df[df['timelen'] > 75447.537602]
In [176]:
len(std_outliers)
Out[176]:
In [177]:
std_outliers.head()
Out[177]:
In [145]:
IQR = df.quantile(q=0.75) - df.quantile(q=0.25)
In [146]:
IQR
Out[146]:
In [181]:
IQR2 = df[df['timelen'] >= 0].quantile(q=0.75) - df[df['timelen'] >= 0].quantile(q=0.25)
In [182]:
IQR2
Out[182]:
If you think outliers are 1.5 times the interquartile range above the 3rd quartile or below the 1st quartile, there are 1067 outliers, more than with the standard deviation outliers, but still not getting rid of the negative values.
In [152]:
df.quantile(q=0.25) - 1.5 * IQR
Out[152]:
In [153]:
df.quantile(q=0.75) + 1.5 * IQR
Out[153]:
In [154]:
IQR_outliers = df[(df['timelen'] < -8501.2125) | (df['timelen'] > 14366.020833)]
In [155]:
len(IQR_outliers)
Out[155]:
In [183]:
IQR_outliers.head()
Out[183]:
In [184]:
df[df['timelen'] >= 0].quantile(q=0.25) - 1.5 * IQR2
Out[184]:
In [185]:
df[df['timelen'] >= 0].quantile(q=0.75) + 1.5 * IQR2
Out[185]:
In [186]:
IQR_outliers2 = df[(df['timelen'] > 14366.020833)]
In [187]:
len(IQR_outliers2)
Out[187]:
In [188]:
IQR_outliers2.head()
Out[188]:
In [195]:
plt.boxplot(df['timelen'])
Out[195]:
In [197]:
In [199]:
plt.boxplot(df[df['timelen'] >= 0]['timelen'])
Out[199]:
In [ ]: