Assignment 1

Perform a basic statistical analysis of the time DOT 311 (table is called dot_311) complaints are open (subtract closed date from created date)
Connect to the database to get the data and do the analysis. Submit the code through Github and type up your results in your PR

Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com

Port: 5432

Database: training

Username: dot_student

Password: qgis



In [146]:

    
import pandas as pd
import pg8000 as pg
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime, timedelta



In [82]:

    
engine = create_engine('postgresql+pg8000://dot_student:qgis@training.c1erymiua9dx.us-east-1.rds.amazonaws.com:5432/training')



In [85]:

    
con=engine.raw_connection()



In [86]:

    
df=pd.read_sql_query('select * from dot_311', con=con)



In [88]:

    
len(df)









    Out[88]:





5851



In [89]:

    
df.head(5)









    Out[89]:






  
    
      
      b'gid'
      b'unique_key'
      b'agency'
      b'agency nam'
      b'complaint'
      b'descriptor'
      b'location t'
      b'incident z'
      b'incident a'
      b'street nam'
      ...
      b'bridge h_2'
      b'garage lot'
      b'ferry dire'
      b'ferry term'
      b'latitude'
      b'longitude'
      b'location'
      b'geom'
      b'created_date'
      b'closed_date'
    
  
  
    
      0
      2
      32570549.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301.0
      None
      None
      ...
      None
      None
      None
      None
      40.613804
      -74.113374
      (40.61380375315792, -74.1133737388642)
      0101000020E61000006829ED83418752C0D7FC121F914E...
      2016-02-01 00:12:00
      2016-02-01 01:15:00
    
    
      1
      3
      32572958.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301.0
      None
      None
      ...
      None
      None
      None
      None
      40.626545
      -74.091582
      (40.62654529746493, -74.09158233448058)
      0101000020E610000011DF267CDC8552C08809E5A23250...
      2016-02-01 00:14:00
      2016-02-01 00:54:00
    
    
      2
      4
      32573576.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10310.0
      None
      None
      ...
      None
      None
      None
      None
      40.629250
      -74.123221
      (40.62924955478749, -74.1232207526046)
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:15:00
      2016-02-01 01:30:00
    
    
      3
      5
      32572389.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      NaN
      None
      None
      ...
      None
      None
      None
      None
      NaN
      NaN
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:17:00
      2016-02-01 02:35:00
    
    
      4
      6
      32570548.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      NaN
      None
      None
      ...
      None
      None
      None
      None
      NaN
      NaN
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:19:00
      2016-02-01 01:00:00
    
  

5 rows × 55 columns



In [92]:

    
df.columns
# at first i thought this meant my data was messed up, but these are all just bytes literal (apparently). i had to stringfy them in place









    Out[92]:





Index([         b'gid',   b'unique_key',       b'agency',   b'agency nam',
          b'complaint',   b'descriptor',   b'location t',   b'incident z',
         b'incident a',   b'street nam',   b'cross stre',   b'cross st_1',
         b'intersecti',   b'intersec_1',   b'address ty',         b'city',
           b'landmark',   b'facility t',       b'status',     b'due date',
         b'resolution',   b'resoluti_1',    b'community',      b'borough',
         b'x coordina',   b'y coordina',   b'park facil',   b'park borou',
         b'school nam',   b'school num',   b'school reg',   b'school cod',
         b'school pho',   b'school add',   b'school cit',   b'school sta',
         b'school zip',   b'school not',    b'school or',   b'vehicle ty',
         b'taxi compa',    b'taxi pick',   b'bridge hig',   b'bridge h_1',
          b'road ramp',   b'bridge h_2',   b'garage lot',   b'ferry dire',
         b'ferry term',     b'latitude',    b'longitude',     b'location',
               b'geom', b'created_date',  b'closed_date'],
      dtype='object')



In [100]:

    
df.rename?



In [101]:

    
for bytes_name in df.columns:
    df.rename(columns = {bytes_name:bytes_name.decode('utf-8')}, inplace = True)
    #print(bytes_name.decode('utf-8'))



In [102]:

    
df.columns









    Out[102]:





Index(['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor',
       'location t', 'incident z', 'incident a', 'street nam', 'cross stre',
       'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city',
       'landmark', 'facility t', 'status', 'due date', 'resolution',
       'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina',
       'park facil', 'park borou', 'school nam', 'school num', 'school reg',
       'school cod', 'school pho', 'school add', 'school cit', 'school sta',
       'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa',
       'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2',
       'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude',
       'location', 'geom', 'created_date', 'closed_date'],
      dtype='object')



In [105]:

    
df.groupby(by='agency').describe()









    



/usr/local/lib/python3.5/dist-packages/numpy/lib/function_base.py:3823: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)






    Out[105]:






  
    
      
      
      gid
      incident z
      latitude
      longitude
      unique_key
      x coordina
      y coordina
    
    
      agency
      
      
      
      
      
      
      
      
    
  
  
    
      DOT
      count
      5851.000000
      4400.000000
      4378.000000
      4378.000000
      5.851000e+03
      4.378000e+03
      4378.000000
    
    
      mean
      2926.000000
      10890.437045
      40.712964
      -73.923701
      3.259704e+07
      1.005386e+06
      199054.304249
    
    
      std
      1689.182544
      542.103798
      0.083395
      0.101716
      1.184694e+04
      2.821170e+04
      30382.847763
    
    
      min
      1.000000
      10001.000000
      40.501800
      -74.248608
      3.257032e+07
      9.151240e+05
      122190.000000
    
    
      25%
      1463.500000
      NaN
      NaN
      NaN
      3.258716e+07
      NaN
      NaN
    
    
      50%
      2926.000000
      NaN
      NaN
      NaN
      3.259820e+07
      NaN
      NaN
    
    
      75%
      4388.500000
      NaN
      NaN
      NaN
      3.260721e+07
      NaN
      NaN
    
    
      max
      5851.000000
      11697.000000
      40.910130
      -73.701496
      3.264560e+07
      1.066961e+06
      270878.000000



In [ ]:

    
# apparently every complaint is DOT? that doesn't seem right but i tried pulling the data again and got the same thing. i might be pulling wrong, but
# we will finish with what we have



In [108]:

    
df['closed_date'][0]-df['created_date'][0]









    Out[108]:





Timedelta('0 days 01:03:00')



In [110]:

    
df['open_duration']=df['closed_date']-df['created_date']



In [115]:

    
df['created_date'].head(5)









    Out[115]:





0   2016-02-01 00:12:00
1   2016-02-01 00:14:00
2   2016-02-01 00:15:00
3   2016-02-01 00:17:00
4   2016-02-01 00:19:00
Name: created_date, dtype: datetime64[ns]



In [114]:

    
df.plot(kind='scatter', x='created_date', y='open_duration')









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1944             try:
-> 1945                 return self._engine.get_loc(key)
   1946             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'created_date'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-114-0537a0f6435a> in <module>()
----> 1 df.plot(kind='scatter', x='created_date', y='open_duration')

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3738                           fontsize=fontsize, colormap=colormap, table=table,
   3739                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3740                           sort_columns=sort_columns, **kwds)
   3741     __call__.__doc__ = plot_frame.__doc__
   3742 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2612                  yerr=yerr, xerr=xerr,
   2613                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2614                  **kwds)
   2615 
   2616 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in generate(self)
   1026         self._compute_plot_data()
   1027         self._setup_subplots()
-> 1028         self._make_plot()
   1029         self._add_table()
   1030         self._make_legend()

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _make_plot(self)
   1598         else:
   1599             label = None
-> 1600         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
   1601                              label=label, cmap=cmap, **self.kwds)
   1602         if cb:

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'created_date'



In [ ]:

    
# apparently matplotlib doesnt like datetime64, so lets convert them all



In [129]:

    
df['created']=pd.to_datetime(df['created_date'])



In [130]:

    
df['closed']=pd.to_datetime(df['closed_date'])



In [132]:

    
df['duration']=df['closed']-df['created']



In [135]:

    
plt.plot_date(df['created'], df['duration'])









    Out[135]:





[<matplotlib.lines.Line2D at 0x7f9395d2ce10>]



In [ ]:

    
# that is just about the most worthless chart ive ever seen. what even is that?



In [140]:

    
df['duration'].describe()









    Out[140]:





count                       5650
mean      6 days 15:05:05.589380
std      14 days 12:05:38.260805
min           -19 days +09:29:00
25%              0 days 01:14:00
50%              0 days 21:48:15
75%       4 days 00:30:48.500000
max             89 days 18:54:00
Name: duration, dtype: object



In [142]:

    
# oh that might be why. the minimum time is -19 days, neat



In [151]:

    
df_2=df[df['duration'] > timedelta(minutes=1)]



In [152]:

    
plt.plot_date(df_2['created'], df_2['duration'])









    Out[152]:





[<matplotlib.lines.Line2D at 0x7f9395c54b00>]



In [153]:

    
# i am still not certain what that might mean, but its better than the first one



In [158]:

    
df_2['duration'].describe()









    Out[158]:





count                       4543
mean      8 days 16:26:03.597182
std      15 days 10:54:50.797986
min              0 days 00:05:00
25%              0 days 12:47:00
50%              1 days 03:33:59
75%              6 days 00:35:34
max             89 days 18:54:00
Name: duration, dtype: object



In [161]:

    
df_2.index=df_2['created']



In [162]:

    
df_2.resample('H').count().plot(y='unique_key')









    Out[162]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9395cc6240>



In [163]:

    
# thats kind of a more meaningful chart? im getting somewhere, i think



In [167]:

    
df_2['duration'].groupby(df_2.index.hour).describe()









    Out[167]:





0   count                         53
    mean      2 days 13:38:01.396226
    std       6 days 01:42:00.802987
    min              0 days 00:10:00
    25%              0 days 02:18:00
    50%              0 days 20:49:07
    75%              1 days 12:50:00
    max             38 days 11:01:00
1   count                         45
    mean      7 days 02:11:48.555555
    std      17 days 02:01:19.208801
    min              0 days 00:15:00
    25%              0 days 12:04:00
    50%              1 days 00:33:04
    75%              5 days 04:25:25
    max             81 days 22:45:00
2   count                         23
    mean      4 days 07:23:59.173913
    std      14 days 11:17:39.886199
    min              0 days 00:10:00
    25%              0 days 00:41:00
    50%              0 days 06:56:59
    75%       1 days 09:52:09.500000
    max             69 days 21:55:00
3   count                         33
    mean      4 days 13:08:18.666666
    std      11 days 09:16:13.973938
    min              0 days 00:10:00
    25%              0 days 00:48:00
    50%              0 days 05:36:00
                      ...           
20  std      12 days 08:40:36.335564
    min              0 days 00:20:00
    25%              0 days 01:37:00
    50%              0 days 01:57:00
    75%              1 days 15:16:45
    max             84 days 03:08:00
21  count                        100
    mean      5 days 10:50:59.520000
    std      11 days 14:07:18.095542
    min              0 days 00:11:00
    25%              0 days 13:12:30
    50%              1 days 14:18:30
    75%       3 days 21:33:02.500000
    max             78 days 18:49:00
22  count                        135
    mean      3 days 11:40:20.066666
    std       7 days 08:18:15.404167
    min              0 days 00:25:00
    25%              0 days 14:36:20
    50%              0 days 20:00:44
    75%              2 days 14:12:30
    max             46 days 14:12:00
23  count                        125
    mean      3 days 13:02:37.784000
    std       9 days 15:59:37.987178
    min              0 days 00:10:00
    25%              0 days 14:36:00
    50%              0 days 21:04:18
    75%              2 days 11:00:45
    max             84 days 14:59:12
Name: duration, dtype: object



In [168]:

    
# well, that was ticket duration by hour opened in the day. now to plot that somehow



In [177]:

    
df_2['duration'].groupby(df_2.index.hour).plot(y='duration')









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-177-325164046044> in <module>()
----> 1 df_2['duration'].groupby(df_2.index.hour).plot(y='duration')

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in __call__(self, *args, **kwargs)
    309             return self.plot(*args, **kwargs)
    310         f.__name__ = 'plot'
--> 311         return self._groupby.apply(f)
    312 
    313     def __getattr__(self, name):

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in apply(self, func, *args, **kwargs)
    649         # ignore SettingWithCopy here in case the user mutates
    650         with option_context('mode.chained_assignment', None):
--> 651             return self._python_apply_general(f)
    652 
    653     def _python_apply_general(self, f):

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in _python_apply_general(self, f)
    653     def _python_apply_general(self, f):
    654         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 655                                                    self.axis)
    656 
    657         return self._wrap_applied_output(

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in apply(self, f, data, axis)
   1525             # group might be modified
   1526             group_axes = _get_axes(group)
-> 1527             res = f(group)
   1528             if not _is_indexed_like(res, group_axes):
   1529                 mutated = True

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in f(g)
    645         @wraps(func)
    646         def f(g):
--> 647             return func(g, *args, **kwargs)
    648 
    649         # ignore SettingWithCopy here in case the user mutates

/usr/local/lib/python3.5/dist-packages/pandas/core/groupby.py in f(self)
    307     def __call__(self, *args, **kwargs):
    308         def f(self):
--> 309             return self.plot(*args, **kwargs)
    310         f.__name__ = 'plot'
    311         return self._groupby.apply(f)

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   3564                            colormap=colormap, table=table, yerr=yerr,
   3565                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 3566                            **kwds)
   3567     __call__.__doc__ = plot_series.__doc__
   3568 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2643                  yerr=yerr, xerr=xerr,
   2644                  label=label, secondary_y=secondary_y,
-> 2645                  **kwds)
   2646 
   2647 

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in generate(self)
   1024     def generate(self):
   1025         self._args_adjust()
-> 1026         self._compute_plot_data()
   1027         self._setup_subplots()
   1028         self._make_plot()

/usr/local/lib/python3.5/dist-packages/pandas/tools/plotting.py in _compute_plot_data(self)
   1133         if is_empty:
   1134             raise TypeError('Empty {0!r}: no numeric data to '
-> 1135                             'plot'.format(numeric_data.__class__.__name__))
   1136 
   1137         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot



In [ ]:

	b'gid'	b'unique_key'	b'agency'	b'agency nam'	b'complaint'	b'descriptor'	b'location t'	b'incident z'	b'incident a'	b'street nam'	...	b'bridge h_2'	b'garage lot'	b'ferry dire'	b'ferry term'	b'latitude'	b'longitude'	b'location'	b'geom'	b'created_date'	b'closed_date'
0	2	32570549.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10301.0	None	None	...	None	None	None	None	40.613804	-74.113374	(40.61380375315792, -74.1133737388642)	0101000020E61000006829ED83418752C0D7FC121F914E...	2016-02-01 00:12:00	2016-02-01 01:15:00
1	3	32572958.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10301.0	None	None	...	None	None	None	None	40.626545	-74.091582	(40.62654529746493, -74.09158233448058)	0101000020E610000011DF267CDC8552C08809E5A23250...	2016-02-01 00:14:00	2016-02-01 00:54:00
2	4	32573576.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10310.0	None	None	...	None	None	None	None	40.629250	-74.123221	(40.62924955478749, -74.1232207526046)	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:15:00	2016-02-01 01:30:00
3	5	32572389.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	NaN	None	None	...	None	None	None	None	NaN	NaN	None	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:17:00	2016-02-01 02:35:00
4	6	32570548.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	NaN	None	None	...	None	None	None	None	NaN	NaN	None	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:19:00	2016-02-01 01:00:00

		gid	incident z	latitude	longitude	unique_key	x coordina	y coordina
agency
DOT	count	5851.000000	4400.000000	4378.000000	4378.000000	5.851000e+03	4.378000e+03	4378.000000
	mean	2926.000000	10890.437045	40.712964	-73.923701	3.259704e+07	1.005386e+06	199054.304249
	std	1689.182544	542.103798	0.083395	0.101716	1.184694e+04	2.821170e+04	30382.847763
	min	1.000000	10001.000000	40.501800	-74.248608	3.257032e+07	9.151240e+05	122190.000000
	25%	1463.500000	NaN	NaN	NaN	3.258716e+07	NaN	NaN
	50%	2926.000000	NaN	NaN	NaN	3.259820e+07	NaN	NaN
	75%	4388.500000	NaN	NaN	NaN	3.260721e+07	NaN	NaN
	max	5851.000000	11697.000000	40.910130	-73.701496	3.264560e+07	1.066961e+06	270878.000000