Assignment 1

Query the training database you used in homework 4-1 and query the noise_311 table Perform a basic statistical analysis of the time a complaint is left open Compare the results to what you found for DOT complaints. Submit this as Do Now 5

Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com

Port: 5432

Database: training

Username: dot_student

Password: qgis


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
import pg8000
from pandas import DataFrame

In [ ]:
conn.re

In [19]:
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", user='dot_student', password='qgis', database='training')
cursor = conn.cursor()


---------------------------------------------------------------------------
ProgrammingError                          Traceback (most recent call last)
<ipython-input-19-298b0e572a40> in <module>()
----> 1 conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", user='dot_student', password='qgis', database='training')
      2 cursor = conn.cursor()

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\__init__.py in connect(user, host, unix_sock, port, database, password, ssl, timeout, **kwargs)
    101     """
    102     return Connection(
--> 103         user, host, unix_sock, port, database, password, ssl, timeout)
    104 
    105 apilevel = "2.0"

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in __init__(self, user, host, unix_sock, port, database, password, ssl, timeout)
   1610                 except Exception:
   1611                     pass
-> 1612                 raise e
   1613 
   1614         self.in_transaction = False

c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in __init__(self, user, host, unix_sock, port, database, password, ssl, timeout)
   1604                     self.message_types[code](self._read(data_len - 4), None)
   1605                 if self.error is not None:
-> 1606                     raise self.error
   1607             except Exception as e:
   1608                 try:

ProgrammingError: ('FATAL', '53300', 'remaining connection slots are reserved for non-replication superuser and rds_superuser connections', 'postinit.c', '744', 'InitPostgres', '', '')

In [20]:
cursor.execute("select column_name from information_schema.columns where table_name='noise_311'")
column_list = []
for row in cursor.fetchall():
    column_list.append(row[0])
column_list


Out[20]:
['unique_key',
 'created_date',
 'closed_date',
 'agency',
 'agency_name',
 'complaint_type',
 'descriptor',
 'location_type',
 'incident_zip',
 'incident_address',
 'street_name',
 'cross_street_1',
 'cross_street_2',
 'intersection_street_1',
 'intersection_street_2',
 'address_type',
 'city',
 'landmark',
 'facility_type',
 'status',
 'due_date',
 'resolution_description',
 'resolution_action_updated_date',
 'community_board',
 'borough',
 'x_coordinate',
 'y_coordinate',
 'park_facility_name',
 'park_borough',
 'school_name',
 'school_number',
 'school_region',
 'school_code',
 'school_phone_number',
 'school_address',
 'school_city',
 'school_state',
 'school_zip',
 'school_not_found',
 'school_or_citywide_complaint',
 'vehicle_type',
 'taxi_company_borough',
 'taxi_pick_up_location',
 'bridge_highway_name',
 'bridge_highway_direction',
 'road_ramp',
 'bridge_highway_segment',
 'garage_lot_name',
 'ferry_direction',
 'ferry_terminal_name',
 'latitude',
 'longitude',
 'location']

In [21]:
response = "select * from noise_311"

In [22]:
cursor.execute("select * from noise_311")
output = []
for row in cursor.fetchall():
    output.append(row)

In [23]:
df = pd.DataFrame(data=output, columns=column_list)

In [24]:
df.head(3)


Out[24]:
unique_key created_date closed_date agency agency_name complaint_type descriptor location_type incident_zip incident_address ... bridge_highway_name bridge_highway_direction road_ramp bridge_highway_segment garage_lot_name ferry_direction ferry_terminal_name latitude longitude location
0 28792167 2014-08-31 23:59:00 2014-09-01 03:52:00 NYPD New York City Police Department Noise - Street/Sidewalk Loud Music/Party Street/Sidewalk 11222 200 KINGSLAND AVENUE ... None None None None None None None 40.7238883 -73.94134889 (40.723888303549415, -73.94134888943505)
1 28789088 2014-08-31 23:56:00 2014-09-01 06:17:00 NYPD New York City Police Department Noise - Vehicle Car/Truck Music Street/Sidewalk 11234 FLATLANDS AVENUE ... None None None None None None None 40.61948901 -73.93805105 (40.61948901090983, -73.93805104516916)
2 28791854 2014-08-31 23:54:00 2014-09-01 01:29:00 NYPD New York City Police Department Noise - Commercial Loud Music/Party Club/Bar/Restaurant 10002 161 LUDLOW STREET ... None None None None None None None 40.72141034 -73.98769444 (40.72141034382407, -73.98769444021134)

3 rows × 53 columns


In [27]:
sel_df = df[[
 'unique_key',
 'agency',
 'agency_name',
 'created_date',
 'closed_date',                  
 'complaint_type',
 'descriptor',
 'status',
 'due_date',
 'resolution_description',
 'city',
 'borough'
 ]]

In [28]:
sel_df.head(3)


Out[28]:
unique_key agency agency_name created_date closed_date complaint_type descriptor status due_date resolution_description city borough
0 28792167 NYPD New York City Police Department 2014-08-31 23:59:00 2014-09-01 03:52:00 Noise - Street/Sidewalk Loud Music/Party Closed 9/1/14 7:59 None BROOKLYN BROOKLYN
1 28789088 NYPD New York City Police Department 2014-08-31 23:56:00 2014-09-01 06:17:00 Noise - Vehicle Car/Truck Music Closed 9/1/14 7:56 None BROOKLYN BROOKLYN
2 28791854 NYPD New York City Police Department 2014-08-31 23:54:00 2014-09-01 01:29:00 Noise - Commercial Loud Music/Party Closed 9/1/14 7:54 None NEW YORK MANHATTAN

In [29]:
sel_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37615 entries, 0 to 37614
Data columns (total 12 columns):
unique_key                37615 non-null int64
agency                    37615 non-null object
agency_name               37615 non-null object
created_date              37615 non-null datetime64[ns]
closed_date               37615 non-null datetime64[ns]
complaint_type            37615 non-null object
descriptor                37615 non-null object
status                    37615 non-null object
due_date                  34758 non-null object
resolution_description    0 non-null object
city                      37602 non-null object
borough                   37615 non-null object
dtypes: datetime64[ns](2), int64(1), object(9)
memory usage: 2.2+ MB

In [78]:
#dateutil.parser.parse("2016-02-01 00:15:00")

In [30]:
sel_df['ClosedDate'] = pd.to_datetime(sel_df['closed_date'], format='%m/%d/%y %H:%M:%S')
sel_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37615 entries, 0 to 37614
Data columns (total 13 columns):
unique_key                37615 non-null int64
agency                    37615 non-null object
agency_name               37615 non-null object
created_date              37615 non-null datetime64[ns]
closed_date               37615 non-null datetime64[ns]
complaint_type            37615 non-null object
descriptor                37615 non-null object
status                    37615 non-null object
due_date                  34758 non-null object
resolution_description    0 non-null object
city                      37602 non-null object
borough                   37615 non-null object
ClosedDate                37615 non-null datetime64[ns]
dtypes: datetime64[ns](3), int64(1), object(9)
memory usage: 2.4+ MB
c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [31]:
#no need to do this step as created date and closed date are already in datetime format
sel_df['CreatedDate'] = pd.to_datetime(sel_df['created_date'], format='%m/%d/%y %H:%M:%S')
sel_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37615 entries, 0 to 37614
Data columns (total 14 columns):
unique_key                37615 non-null int64
agency                    37615 non-null object
agency_name               37615 non-null object
created_date              37615 non-null datetime64[ns]
closed_date               37615 non-null datetime64[ns]
complaint_type            37615 non-null object
descriptor                37615 non-null object
status                    37615 non-null object
due_date                  34758 non-null object
resolution_description    0 non-null object
city                      37602 non-null object
borough                   37615 non-null object
ClosedDate                37615 non-null datetime64[ns]
CreatedDate               37615 non-null datetime64[ns]
dtypes: datetime64[ns](4), int64(1), object(9)
memory usage: 2.7+ MB
c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [32]:
sel_df['delay'] = sel_df['ClosedDate'] - sel_df['CreatedDate']
sel_df['delay'].head(3)


c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[32]:
0   03:53:00
1   06:21:00
2   01:35:00
Name: delay, dtype: timedelta64[ns]

In [34]:
sel_df['delay'].isnull().value_counts()


Out[34]:
False    37615
Name: delay, dtype: int64

In [35]:
delay_delta = pd.DataFrame(sel_df['delay'][sel_df['delay']> pd.Timedelta(seconds=0)].dropna())
delay_delta.describe()


Out[35]:
delay
count 37615
mean 0 days 07:44:22.532500
std 1 days 12:06:09.012799
min 0 days 00:01:00
25% 0 days 00:54:00
50% 0 days 02:02:00
75% 0 days 04:15:00
max 48 days 05:45:00

In [36]:
delay_delta.head()


Out[36]:
delay
0 03:53:00
1 06:21:00
2 01:35:00
3 03:01:00
4 01:19:00

In [37]:
#now all the null values have been removed. great ! now i can test with a dataframe.
delay_delta['delay'].isnull().value_counts()


Out[37]:
False    37615
Name: delay, dtype: int64

In [41]:
#changing the delta value to seconds so that we can plot easily
delay_delta['delay'].astype('timedelta64[m]')


Out[41]:
0        233.0
1        381.0
2         95.0
3        181.0
4         79.0
5        186.0
6         64.0
7        258.0
8        288.0
9        205.0
10       292.0
11       156.0
12       234.0
13       313.0
14       186.0
15        60.0
16       207.0
17       812.0
18       105.0
19       168.0
20       354.0
21         7.0
22        33.0
23       171.0
24       327.0
25        21.0
26       132.0
27       166.0
28       176.0
29       313.0
         ...  
37585    182.0
37586    254.0
37587    111.0
37588    264.0
37589     42.0
37590    376.0
37591    111.0
37592     39.0
37593    287.0
37594     47.0
37595     50.0
37596    118.0
37597     88.0
37598    117.0
37599    127.0
37600    141.0
37601    185.0
37602    176.0
37603     55.0
37604     74.0
37605     95.0
37606    842.0
37607    192.0
37608    102.0
37609     72.0
37610    624.0
37611    144.0
37612    278.0
37613    134.0
37614    458.0
Name: delay, dtype: float64

In [43]:
# plotting time delata in days bucket
delay_delta['delay'].astype('timedelta64[m]').hist(bins=20)
plt.xlabel('Response in Minutes')
plt.ylabel('Frequency of Resonse')


Out[43]:
<matplotlib.text.Text at 0xb5def10>

In [44]:
# plotting time delata in days bucket
delay_delta['delay'].astype('timedelta64[D]').hist(bins=20)
plt.xlabel('Response in Minutes')
plt.ylabel('Frequency of Resonse')


Out[44]:
<matplotlib.text.Text at 0xb936710>

In [21]:
delay_delta['delay'].head()


Out[21]:
0   01:03:00
1   00:40:00
2   01:15:00
3   02:18:00
4   00:41:00
Name: delay, dtype: timedelta64[ns]

In [46]:
sel_df.head(3)


Out[46]:
unique_key agency agency_name created_date closed_date complaint_type descriptor status due_date resolution_description city borough ClosedDate CreatedDate delay
0 28792167 NYPD New York City Police Department 2014-08-31 23:59:00 2014-09-01 03:52:00 Noise - Street/Sidewalk Loud Music/Party Closed 9/1/14 7:59 None BROOKLYN BROOKLYN 2014-09-01 03:52:00 2014-08-31 23:59:00 03:53:00
1 28789088 NYPD New York City Police Department 2014-08-31 23:56:00 2014-09-01 06:17:00 Noise - Vehicle Car/Truck Music Closed 9/1/14 7:56 None BROOKLYN BROOKLYN 2014-09-01 06:17:00 2014-08-31 23:56:00 06:21:00
2 28791854 NYPD New York City Police Department 2014-08-31 23:54:00 2014-09-01 01:29:00 Noise - Commercial Loud Music/Party Closed 9/1/14 7:54 None NEW YORK MANHATTAN 2014-09-01 01:29:00 2014-08-31 23:54:00 01:35:00

In [47]:
delay_delta['delay'].head()


Out[47]:
0   03:53:00
1   06:21:00
2   01:35:00
3   03:01:00
4   01:19:00
Name: delay, dtype: timedelta64[ns]

In [48]:
delay_delta['delay'].value_counts().head()


Out[48]:
00:21:00    222
00:10:00    221
00:36:00    212
00:33:00    211
00:31:00    210
Name: delay, dtype: int64

In [49]:
delay_delta['delay'].value_counts().hist()
plt.xlabel('Delay')
plt.ylabel('Frequency of Delay')


Out[49]:
<matplotlib.text.Text at 0xba3def0>

In [51]:
delay_plot = pd.DataFrame(delay_delta['delay'].value_counts())
delay_plot.head()


Out[51]:
delay
00:21:00 222
00:10:00 221
00:36:00 212
00:33:00 211
00:31:00 210

In [51]:
delay_delta['delay'].describe()


Out[51]:
count                     37615
mean     0 days 07:44:22.532500
std      1 days 12:06:09.012799
min             0 days 00:01:00
25%             0 days 00:54:00
50%             0 days 02:02:00
75%             0 days 04:15:00
max            48 days 05:45:00
Name: delay, dtype: object
ERROR! Session/line number was not unique in database. History logging moved to new session 508
Traceback (most recent call last):

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\ipykernel\ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\interactiveshell.py", line 2729, in run_cell
    self.events.trigger('post_execute')

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\events.py", line 74, in trigger
    func(*args, **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\ipykernel\pylab\backend_inline.py", line 113, in flush_figures
    return show(True)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\ipykernel\pylab\backend_inline.py", line 36, in show
    display(figure_manager.canvas.figure)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\display.py", line 158, in display
    format_dict, md_dict = format(obj, include=include, exclude=exclude)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\formatters.py", line 177, in format
    data = formatter(obj)

  File "<decorator-gen-9>", line 2, in __call__

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\formatters.py", line 222, in catch_format_error
    r = method(self, *args, **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\formatters.py", line 339, in __call__
    return printer(obj)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\pylabtools.py", line 228, in <lambda>
    png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\IPython\core\pylabtools.py", line 119, in print_figure
    fig.canvas.print_figure(bytes_io, **kw)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\backend_bases.py", line 2180, in print_figure
    **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\backends\backend_agg.py", line 527, in print_png
    FigureCanvasAgg.draw(self)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\backends\backend_agg.py", line 474, in draw
    self.figure.draw(self.renderer)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\artist.py", line 61, in draw_wrapper
    draw(artist, renderer, *args, **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\figure.py", line 1159, in draw
    func(*args)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\artist.py", line 61, in draw_wrapper
    draw(artist, renderer, *args, **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axes\_base.py", line 2324, in draw
    a.draw(renderer)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\artist.py", line 61, in draw_wrapper
    draw(artist, renderer, *args, **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 1106, in draw
    ticks_to_draw = self._update_ticks(renderer)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 949, in _update_ticks
    tick_tups = [t for t in self.iter_ticks()]

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 949, in <listcomp>
    tick_tups = [t for t in self.iter_ticks()]

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 893, in iter_ticks
    majorTicks = self.get_major_ticks(len(majorLocs))

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 1294, in get_major_ticks
    tick = self._get_tick(major=True)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 1675, in _get_tick
    return XTick(self.axes, 0, '', major=major, **tick_kw)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 156, in __init__
    self.label2 = self._get_text2()

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 382, in _get_text2
    trans, vert, horiz = self._get_text2_transform()

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axis.py", line 343, in _get_text2_transform
    return self.axes.get_xaxis_text2_transform(self._pad)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\axes\_base.py", line 706, in get_xaxis_text2_transform
    self.figure.dpi_scale_trans),

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\transforms.py", line 1157, in __add__
    return composite_transform_factory(self, other)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\transforms.py", line 2473, in composite_transform_factory
    return CompositeGenericTransform(a, b)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\transforms.py", line 2285, in __init__
    Transform.__init__(self, **kwargs)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\matplotlib\transforms.py", line 98, in __init__
    self._parents = WeakValueDictionary()

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\weakref.py", line 120, in __init__
    self.update(*args, **kw)

  File "c:\users\radhika\appdata\local\programs\python\python35-32\lib\weakref.py", line 266, in update
    if len(args) > 1:

KeyboardInterrupt

In [52]:
delay_delta['delay'].quantile(q=0.25)


Out[52]:
Timedelta('0 days 00:54:00')
ERROR! Session/line number was not unique in database. History logging moved to new session 509

In [53]:
delay_delta['delay'].quantile(q=0.5)


Out[53]:
Timedelta('0 days 02:02:00')

In [54]:
delay_delta['delay'].quantile(q=0.75)


Out[54]:
Timedelta('0 days 04:15:00')

In [ ]: