Assignment 1

Perform a basic statistical analysis of the time DOT 311 (table is called dot_311) complaints are open (subtract closed date from created date)
Connect to the database to get the data and do the analysis. Submit the code through Github and type up your results in your PR

Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com

Port: 5432

Database: training

Username: dot_student

Password: qgis


In [146]:
import pandas as pd
import pg8000 as pg
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime, timedelta

In [82]:
engine = create_engine('postgresql+pg8000://dot_student:qgis@training.c1erymiua9dx.us-east-1.rds.amazonaws.com:5432/training')

In [85]:
con=engine.raw_connection()

In [86]:
df=pd.read_sql_query('select * from dot_311', con=con)

In [88]:
len(df)


Out[88]:
5851

In [89]:
df.head(5)


Out[89]:
b'gid' b'unique_key' b'agency' b'agency nam' b'complaint' b'descriptor' b'location t' b'incident z' b'incident a' b'street nam' ... b'bridge h_2' b'garage lot' b'ferry dire' b'ferry term' b'latitude' b'longitude' b'location' b'geom' b'created_date' b'closed_date'
0 2 32570549.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.613804 -74.113374 (40.61380375315792, -74.1133737388642) 0101000020E61000006829ED83418752C0D7FC121F914E... 2016-02-01 00:12:00 2016-02-01 01:15:00
1 3 32572958.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.626545 -74.091582 (40.62654529746493, -74.09158233448058) 0101000020E610000011DF267CDC8552C08809E5A23250... 2016-02-01 00:14:00 2016-02-01 00:54:00
2 4 32573576.0 DOT Department of Transportation Traffic Signal Condition Controller None 10310.0 None None ... None None None None 40.629250 -74.123221 (40.62924955478749, -74.1232207526046) 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:15:00 2016-02-01 01:30:00
3 5 32572389.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:17:00 2016-02-01 02:35:00
4 6 32570548.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:19:00 2016-02-01 01:00:00

5 rows × 55 columns


In [92]:
df.columns
# at first i thought this meant my data was messed up, but these are all just bytes literal (apparently). i had to stringfy them in place


Out[92]:
Index([         b'gid',   b'unique_key',       b'agency',   b'agency nam',
          b'complaint',   b'descriptor',   b'location t',   b'incident z',
         b'incident a',   b'street nam',   b'cross stre',   b'cross st_1',
         b'intersecti',   b'intersec_1',   b'address ty',         b'city',
           b'landmark',   b'facility t',       b'status',     b'due date',
         b'resolution',   b'resoluti_1',    b'community',      b'borough',
         b'x coordina',   b'y coordina',   b'park facil',   b'park borou',
         b'school nam',   b'school num',   b'school reg',   b'school cod',
         b'school pho',   b'school add',   b'school cit',   b'school sta',
         b'school zip',   b'school not',    b'school or',   b'vehicle ty',
         b'taxi compa',    b'taxi pick',   b'bridge hig',   b'bridge h_1',
          b'road ramp',   b'bridge h_2',   b'garage lot',   b'ferry dire',
         b'ferry term',     b'latitude',    b'longitude',     b'location',
               b'geom', b'created_date',  b'closed_date'],
      dtype='object')

In [100]:
df.rename?

In [101]:
for bytes_name in df.columns:
    df.rename(columns = {bytes_name:bytes_name.decode('utf-8')}, inplace = True)
    #print(bytes_name.decode('utf-8'))

In [102]:
df.columns


Out[102]:
Index(['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor',
       'location t', 'incident z', 'incident a', 'street nam', 'cross stre',
       'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city',
       'landmark', 'facility t', 'status', 'due date', 'resolution',
       'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina',
       'park facil', 'park borou', 'school nam', 'school num', 'school reg',
       'school cod', 'school pho', 'school add', 'school cit', 'school sta',
       'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa',
       'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2',
       'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude',
       'location', 'geom', 'created_date', 'closed_date'],
      dtype='object')

In [105]:
df.groupby(by='agency').describe()


/usr/local/lib/python3.5/dist-packages/numpy/lib/function_base.py:3823: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[105]:
gid incident z latitude longitude unique_key x coordina y coordina
agency
DOT count 5851.000000 4400.000000 4378.000000 4378.000000 5.851000e+03 4.378000e+03 4378.000000
mean 2926.000000 10890.437045 40.712964 -73.923701 3.259704e+07 1.005386e+06 199054.304249
std 1689.182544 542.103798 0.083395 0.101716 1.184694e+04 2.821170e+04 30382.847763
min 1.000000 10001.000000 40.501800 -74.248608 3.257032e+07 9.151240e+05 122190.000000
25% 1463.500000 NaN NaN NaN 3.258716e+07 NaN NaN
50% 2926.000000 NaN NaN NaN 3.259820e+07 NaN NaN
75% 4388.500000 NaN NaN NaN 3.260721e+07 NaN NaN
max 5851.000000 11697.000000 40.910130 -73.701496 3.264560e+07 1.066961e+06 270878.000000

In [ ]:
# apparently every complaint is DOT? that doesn't seem right but i tried pulling the data again and got the same thing. i might be pulling wrong, but
# we will finish with what we have

In [108]:
df['closed_date'][0]-df['created_date'][0]


Out[108]:
Timedelta('0 days 01:03:00')

In [110]:
df['open_duration']=df['closed_date']-df['created_date']

In [115]:
df['created_date'].head(5)


Out[115]:
0   2016-02-01 00:12:00
1   2016-02-01 00:14:00
2   2016-02-01 00:15:00
3   2016-02-01 00:17:00
4   2016-02-01 00:19:00
Name: created_date, dtype: datetime64[ns]

In [114]:
df.plot(kind='scatter', x='created_date', y='open_duration')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1944             try:
-> 1945                 return self._engine.get_loc(key)
   1946             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'created_date'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-114-0537a0f6435a> in <module>()
----> 1 df.plot(kind='scatter', x='created_date', y='open_duration')

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3738                           fontsize=fontsize, colormap=colormap, table=table,
   3739                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3740                           sort_columns=sort_columns, **kwds)
   3741     __call__.__doc__ = plot_frame.__doc__
   3742 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2612                  yerr=yerr, xerr=xerr,
   2613                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2614                  **kwds)
   2615 
   2616 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in generate(self)
   1026         self._compute_plot_data()
   1027         self._setup_subplots()
-> 1028         self._make_plot()
   1029         self._add_table()
   1030         self._make_legend()

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _make_plot(self)
   1598         else:
   1599             label = None
-> 1600         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
   1601                              label=label, cmap=cmap, **self.kwds)
   1602         if cb:

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'created_date'

In [ ]:
# apparently matplotlib doesnt like datetime64, so lets convert them all

In [129]:
df['created']=pd.to_datetime(df['created_date'])

In [130]:
df['closed']=pd.to_datetime(df['closed_date'])

In [132]:
df['duration']=df['closed']-df['created']

In [135]:
plt.plot_date(df['created'], df['duration'])


Out[135]:
[<matplotlib.lines.Line2D at 0x7f9395d2ce10>]

In [ ]:
# that is just about the most worthless chart ive ever seen. what even is that?

In [140]:
df['duration'].describe()


Out[140]:
count                       5650
mean      6 days 15:05:05.589380
std      14 days 12:05:38.260805
min           -19 days +09:29:00
25%              0 days 01:14:00
50%              0 days 21:48:15
75%       4 days 00:30:48.500000
max             89 days 18:54:00
Name: duration, dtype: object

In [142]:
# oh that might be why. the minimum time is -19 days, neat

In [151]:
df_2=df[df['duration'] > timedelta(minutes=1)]

In [152]:
plt.plot_date(df_2['created'], df_2['duration'])


Out[152]:
[<matplotlib.lines.Line2D at 0x7f9395c54b00>]

In [153]:
# i am still not certain what that might mean, but its better than the first one

In [158]:
df_2['duration'].describe()


Out[158]:
count                       4543
mean      8 days 16:26:03.597182
std      15 days 10:54:50.797986
min              0 days 00:05:00
25%              0 days 12:47:00
50%              1 days 03:33:59
75%              6 days 00:35:34
max             89 days 18:54:00
Name: duration, dtype: object

In [161]:
df_2.index=df_2['created']

In [162]:
df_2.resample('H').count().plot(y='unique_key')


Out[162]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9395cc6240>

In [163]:
# thats kind of a more meaningful chart? im getting somewhere, i think

In [167]:
df_2['duration'].groupby(df_2.index.hour).describe()


Out[167]:
0   count                         53
    mean      2 days 13:38:01.396226
    std       6 days 01:42:00.802987
    min              0 days 00:10:00
    25%              0 days 02:18:00
    50%              0 days 20:49:07
    75%              1 days 12:50:00
    max             38 days 11:01:00
1   count                         45
    mean      7 days 02:11:48.555555
    std      17 days 02:01:19.208801
    min              0 days 00:15:00
    25%              0 days 12:04:00
    50%              1 days 00:33:04
    75%              5 days 04:25:25
    max             81 days 22:45:00
2   count                         23
    mean      4 days 07:23:59.173913
    std      14 days 11:17:39.886199
    min              0 days 00:10:00
    25%              0 days 00:41:00
    50%              0 days 06:56:59
    75%       1 days 09:52:09.500000
    max             69 days 21:55:00
3   count                         33
    mean      4 days 13:08:18.666666
    std      11 days 09:16:13.973938
    min              0 days 00:10:00
    25%              0 days 00:48:00
    50%              0 days 05:36:00
                      ...           
20  std      12 days 08:40:36.335564
    min              0 days 00:20:00
    25%              0 days 01:37:00
    50%              0 days 01:57:00
    75%              1 days 15:16:45
    max             84 days 03:08:00
21  count                        100
    mean      5 days 10:50:59.520000
    std      11 days 14:07:18.095542
    min              0 days 00:11:00
    25%              0 days 13:12:30
    50%              1 days 14:18:30
    75%       3 days 21:33:02.500000
    max             78 days 18:49:00
22  count                        135
    mean      3 days 11:40:20.066666
    std       7 days 08:18:15.404167
    min              0 days 00:25:00
    25%              0 days 14:36:20
    50%              0 days 20:00:44
    75%              2 days 14:12:30
    max             46 days 14:12:00
23  count                        125
    mean      3 days 13:02:37.784000
    std       9 days 15:59:37.987178
    min              0 days 00:10:00
    25%              0 days 14:36:00
    50%              0 days 21:04:18
    75%              2 days 11:00:45
    max             84 days 14:59:12
Name: duration, dtype: object

In [168]:
# well, that was ticket duration by hour opened in the day. now to plot that somehow

In [177]:
df_2['duration'].groupby(df_2.index.hour).plot(y='duration')


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-177-325164046044> in <module>()
----> 1 df_2['duration'].groupby(df_2.index.hour).plot(y='duration')

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in __call__(self, *args, **kwargs)
    309             return self.plot(*args, **kwargs)
    310         f.__name__ = 'plot'
--> 311         return self._groupby.apply(f)
    312 
    313     def __getattr__(self, name):

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
    649         # ignore SettingWithCopy here in case the user mutates
    650         with option_context('mode.chained_assignment', None):
--> 651             return self._python_apply_general(f)
    652 
    653     def _python_apply_general(self, f):

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in _python_apply_general(self, f)
    653     def _python_apply_general(self, f):
    654         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 655                                                    self.axis)
    656 
    657         return self._wrap_applied_output(

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in apply(self, f, data, axis)
   1525             # group might be modified
   1526             group_axes = _get_axes(group)
-> 1527             res = f(group)
   1528             if not _is_indexed_like(res, group_axes):
   1529                 mutated = True

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in f(g)
    645         @wraps(func)
    646         def f(g):
--> 647             return func(g, *args, **kwargs)
    648 
    649         # ignore SettingWithCopy here in case the user mutates

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in f(self)
    307     def __call__(self, *args, **kwargs):
    308         def f(self):
--> 309             return self.plot(*args, **kwargs)
    310         f.__name__ = 'plot'
    311         return self._groupby.apply(f)

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   3564                            colormap=colormap, table=table, yerr=yerr,
   3565                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 3566                            **kwds)
   3567     __call__.__doc__ = plot_series.__doc__
   3568 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2643                  yerr=yerr, xerr=xerr,
   2644                  label=label, secondary_y=secondary_y,
-> 2645                  **kwds)
   2646 
   2647 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in generate(self)
   1024     def generate(self):
   1025         self._args_adjust()
-> 1026         self._compute_plot_data()
   1027         self._setup_subplots()
   1028         self._make_plot()

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _compute_plot_data(self)
   1133         if is_empty:
   1134             raise TypeError('Empty {0!r}: no numeric data to '
-> 1135                             'plot'.format(numeric_data.__class__.__name__))
   1136 
   1137         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot

In [ ]: