In [189]:
import pg8000
import datetime
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [28]:
conn = pg8000.connect(database= 'training', port = 5432, host = 'training.c1erymiua9dx.us-east-1.rds.amazonaws.com', user = 'dot_student', password = 'qgis')

In [56]:
conn.rollback()

In [36]:
cursor = conn.cursor()

What does this data look like?


In [39]:
cursor.execute("""SELECT * 
    FROM dot_311 
    LIMIT 1""")
cursor.fetchone()


Out[39]:
[2,
 Decimal('32570549'),
 'DOT',
 'Department of Transportation',
 'Traffic Signal Condition',
 'Controller',
 None,
 Decimal('10301'),
 None,
 None,
 None,
 None,
 'VICTORY BOULEVARD',
 'LITTLE CLOVE ROAD',
 'INTERSECTION',
 'STATEN ISLAND',
 None,
 'N/A',
 'Closed',
 None,
 None,
 '02/01/2016 01:15:00 AM',
 '01 STATEN ISLAND',
 'STATEN ISLAND',
 Decimal('952773'),
 Decimal('162922'),
 'Unspecified',
 'STATEN ISLAND',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 'Unspecified',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 Decimal('40.613803753157917'),
 Decimal('-74.113373738864198'),
 '(40.61380375315792, -74.1133737388642)',
 '0101000020E61000006829ED83418752C0D7FC121F914E4440',
 datetime.datetime(2016, 2, 1, 0, 12),
 datetime.datetime(2016, 2, 1, 1, 15)]

What are the column names?


In [46]:
cursor.execute("""SELECT column_name FROM information_schema.columns WHERE table_name='dot_311'""")
cursor.fetchall()


Out[46]:
(['gid'],
 ['unique_key'],
 ['agency'],
 ['agency nam'],
 ['complaint'],
 ['descriptor'],
 ['location t'],
 ['incident z'],
 ['incident a'],
 ['street nam'],
 ['cross stre'],
 ['cross st_1'],
 ['intersecti'],
 ['intersec_1'],
 ['address ty'],
 ['city'],
 ['landmark'],
 ['facility t'],
 ['status'],
 ['due date'],
 ['resolution'],
 ['resoluti_1'],
 ['community'],
 ['borough'],
 ['x coordina'],
 ['y coordina'],
 ['park facil'],
 ['park borou'],
 ['school nam'],
 ['school num'],
 ['school reg'],
 ['school cod'],
 ['school pho'],
 ['school add'],
 ['school cit'],
 ['school sta'],
 ['school zip'],
 ['school not'],
 ['school or'],
 ['vehicle ty'],
 ['taxi compa'],
 ['taxi pick'],
 ['bridge hig'],
 ['bridge h_1'],
 ['road ramp'],
 ['bridge h_2'],
 ['garage lot'],
 ['ferry dire'],
 ['ferry term'],
 ['latitude'],
 ['longitude'],
 ['location'],
 ['geom'],
 ['created_date'],
 ['closed_date'])

What does a created and closed date look like?


In [53]:
cursor.execute("""SELECT created_date, closed_date 
    FROM dot_311 
    LIMIT 1""")
cursor.fetchone()


Out[53]:
([datetime.datetime(2016, 2, 1, 0, 12), datetime.datetime(2016, 2, 1, 1, 15)],)

Okay, let's just save the created and closed dates.


In [57]:
cursor.execute("""SELECT created_date, closed_date 
    FROM dot_311""")
results = cursor.fetchall()

Exploring how to convert them into the length of time the complaints are open.


In [62]:
results[0][0]


Out[62]:
datetime.datetime(2016, 2, 1, 0, 12)

In [63]:
results[0][1]


Out[63]:
datetime.datetime(2016, 2, 1, 1, 15)

In [64]:
results[0][1] - results[0][0]


Out[64]:
datetime.timedelta(0, 3780)

In [71]:
dif = results[0][1] - results[0][0]
divmod(dif.days * 86400 + dif.seconds, 60)
# datetime.datetime.strftime(datetime.timedelta(0, 3780), "%H:%M")


Out[71]:
(63, 0)

In [79]:
dif = results[0][1] - results[0][0]
divmod(dif.seconds, 60)


Out[79]:
(63, 0)

In [117]:
dif = datetime.timedelta(2, 3780)

In [119]:
dif.days * 1440 + dif.seconds/60


Out[119]:
2943.0

In [73]:
results[:3]


Out[73]:
([datetime.datetime(2016, 2, 1, 0, 12), datetime.datetime(2016, 2, 1, 1, 15)],
 [datetime.datetime(2016, 2, 1, 0, 14), datetime.datetime(2016, 2, 1, 0, 54)],
 [datetime.datetime(2016, 2, 1, 0, 15), datetime.datetime(2016, 2, 1, 1, 30)])

In [83]:
timelen = []
for created, closed in results[:3]:
    print('created', created)
    print('closed', closed)
    print('')


created 2016-02-01 00:12:00
closed 2016-02-01 01:15:00

created 2016-02-01 00:14:00
closed 2016-02-01 00:54:00

created 2016-02-01 00:15:00
closed 2016-02-01 01:30:00

Okay, let's make a list of how many minutes each one of these is open!


In [120]:
timelen = []
for created, closed in results:
    if created and closed:
        dif = closed - created
        timelen.append(dif.days * 1440 + dif.seconds/60)

201 or 1.7% of the items didn't have both a closed and created time


In [121]:
len(results) - len(timelen)


Out[121]:
201

In [122]:
201 / (len(results) + len(timelen))


Out[122]:
0.01747674115294322

And at least 1103 of the results are probably not accurate.


In [123]:
num = 0
for time in timelen:
    if time <= 0:
        num += 1
print(num)


1103

In [124]:
sorted(timelen)[:20]


Out[124]:
[-26791.0,
 -26747.0,
 -26693.0,
 -17280.0,
 -9736.0,
 -9241.0,
 -9240.0,
 -9240.0,
 -8940.0,
 -8940.0,
 -8880.0,
 -8671.0,
 -8641.0,
 -8641.0,
 -8641.0,
 -8641.0,
 -8641.0,
 -8640.0,
 -8640.0,
 -8640.0]

Let's turn this into a Pandas DataFrame.


In [125]:
df = pd.DataFrame({'timelen': timelen})

Okay, let's make a histogram showing how many incidents happen.


In [131]:
df.hist(bins = 28, range = (-120, 1560))


Out[131]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10bdde240>]], dtype=object)

Looks like those 1103 0-minute cases are all outliers that are going to mess things up.

Measures of Center

Mean


In [132]:
df.mean()


Out[132]:
timelen    9545.093156
dtype: float64

In [157]:
df[df['timelen'] >= 0].mean()


Out[157]:
timelen    11210.542883
dtype: float64

Median

The mean is a LOT higher than the median, which suggests the data is skewed.


In [133]:
df.median()


Out[133]:
timelen    1308.25
dtype: float64

In [158]:
df[df['timelen'] >= 0].median()


Out[158]:
timelen    1476.558333
dtype: float64

Mode


In [134]:
df.mode()


Out[134]:
timelen
0 0.0

In [159]:
df[df['timelen'] >= 0].mode()


Out[159]:
timelen
0 0.0

Measures of Spread

Range


In [164]:
max(timelen) - min(timelen)


Out[164]:
156085.0

In [161]:
df[df['timelen'] >= 0].max() - df[df['timelen'] >= 0].min()


Out[161]:
timelen    129294.0
dtype: float64

Standard Deviation


In [163]:
df.std()


Out[163]:
timelen    20885.63768
dtype: float64

In [162]:
df[df['timelen'] >= 0].std()


Out[162]:
timelen    21412.331573
dtype: float64

If you think outliers are numbers three standard deviations away from the mean, there are 68 outliers, but it doesn't catch all of the negative values.


In [137]:
df.mean() + df.std() * 3


Out[137]:
timelen    72202.006197
dtype: float64

In [138]:
df.mean() - df.std() * 3


Out[138]:
timelen   -53111.819884
dtype: float64

In [178]:
std_outliers = df[(df['timelen'] < -53111.819884) | (df['timelen'] > 72202.006197)]

In [179]:
len(std_outliers)


Out[179]:
68

In [165]:
std_outliers.head()


Out[165]:
timelen
31 85300.0
37 90871.0
142 116740.0
361 81365.0
491 72671.0

In [172]:
df[df['timelen'] >= 0].mean() + df[df['timelen'] >= 0].std() * 3


Out[172]:
timelen    75447.537602
dtype: float64

In [173]:
df[df['timelen'] >= 0].mean() - df[df['timelen'] >= 0].std() * 3


Out[173]:
timelen   -53026.451835
dtype: float64

In [175]:
std_outliers = df[df['timelen'] > 75447.537602]

In [176]:
len(std_outliers)


Out[176]:
62

In [177]:
std_outliers.head()


Out[177]:
timelen
31 85300.0
37 90871.0
142 116740.0
361 81365.0
493 100622.0

Interquartile Range


In [145]:
IQR = df.quantile(q=0.75) - df.quantile(q=0.25)

In [146]:
IQR


Out[146]:
timelen    5716.808333
dtype: float64

In [181]:
IQR2 = df[df['timelen'] >= 0].quantile(q=0.75) - df[df['timelen'] >= 0].quantile(q=0.25)

In [182]:
IQR2


Out[182]:
timelen    7064.391667
dtype: float64

If you think outliers are 1.5 times the interquartile range above the 3rd quartile or below the 1st quartile, there are 1067 outliers, more than with the standard deviation outliers, but still not getting rid of the negative values.


In [152]:
df.quantile(q=0.25) - 1.5 * IQR


Out[152]:
timelen   -8501.2125
dtype: float64

In [153]:
df.quantile(q=0.75) + 1.5 * IQR


Out[153]:
timelen    14366.020833
dtype: float64

In [154]:
IQR_outliers = df[(df['timelen'] < -8501.2125) | (df['timelen'] > 14366.020833)]

In [155]:
len(IQR_outliers)


Out[155]:
1067

In [183]:
IQR_outliers.head()


Out[183]:
timelen
31 85300.000000
33 33507.000000
37 90871.000000
137 63416.366667
142 116740.000000

In [184]:
df[df['timelen'] >= 0].quantile(q=0.25) - 1.5 * IQR2


Out[184]:
timelen   -10423.5875
dtype: float64

In [185]:
df[df['timelen'] >= 0].quantile(q=0.75) + 1.5 * IQR2


Out[185]:
timelen    17833.979167
dtype: float64

In [186]:
IQR_outliers2 = df[(df['timelen'] > 14366.020833)]

In [187]:
len(IQR_outliers2)


Out[187]:
938

In [188]:
IQR_outliers2.head()


Out[188]:
timelen
31 85300.000000
33 33507.000000
37 90871.000000
137 63416.366667
142 116740.000000

Box and Whiskers


In [195]:
plt.boxplot(df['timelen'])


Out[195]:
{'boxes': [<matplotlib.lines.Line2D at 0x10c4481d0>],
 'caps': [<matplotlib.lines.Line2D at 0x10c44fbe0>,
  <matplotlib.lines.Line2D at 0x10c44fcf8>],
 'fliers': [<matplotlib.lines.Line2D at 0x10c455d68>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x10c455550>],
 'whiskers': [<matplotlib.lines.Line2D at 0x10c448b70>,
  <matplotlib.lines.Line2D at 0x10c448c88>]}

In [197]:


In [199]:
plt.boxplot(df[df['timelen'] >= 0]['timelen'])


Out[199]:
{'boxes': [<matplotlib.lines.Line2D at 0x10c73c0f0>],
 'caps': [<matplotlib.lines.Line2D at 0x10c743ac8>,
  <matplotlib.lines.Line2D at 0x10c743be0>],
 'fliers': [<matplotlib.lines.Line2D at 0x10c746c50>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x10c746438>],
 'whiskers': [<matplotlib.lines.Line2D at 0x10c73ca58>,
  <matplotlib.lines.Line2D at 0x10c73cb70>]}

In [ ]: