In [1]:
import pg8000
import pandas as pd
import matplotlib
import numpy
%matplotlib inline
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", port=5432, database="training", user="dot_student", password="qgis")
In [2]:
cursor = conn.cursor()
In [3]:
conn.rollback()
In [4]:
cursor.execute("SELECT * FROM noise_311")
results = []
for item in cursor.fetchall():
results.append(item)
In [11]:
cursor.execute("select column_name from information_schema.columns where table_name='noise_311'")
columns_names = []
for item in cursor.fetchall():
print(item)
columns_names.append(item)
In [24]:
df= pd.DataFrame(results)
In [25]:
df.columns = ['unique_key', 'created_date', 'closed_date', 'agency', 'agency_name', 'complaint_type','descriptor','location_type',
'incident_zip',
'incident_address',
'street_name',
'cross_street_1',
'cross_street_2',
'intersection_street_1',
'intersection_street_2',
'address_type',
'city',
'landmark',
'facility_type',
'status',
'due_date',
'resolution_description',
'resolution_action_updated_date',
'community_board',
'borough',
'x_coordinate',
'y_coordinate',
'park_facility_name',
'park_borough',
'school_name',
'school_number',
'school_region',
'school_code',
'school_phone_number',
'school_address',
'school_city',
'school_state',
'school_zip',
'school_not_found',
'school_or_citywide_complaint',
'vehicle_type',
'taxi_company_borough',
'taxi_pick_up_location',
'bridge_highway_name',
'bridge_highway_direction',
'road_ramp',
'bridge_highway_segment',
'garage_lot_name',
'ferry_direction',
'ferry_terminal_name',
'latitude',
'longitude',
'location']
In [26]:
df.head()
Out[26]:
In [27]:
df.columns
Out[27]:
In [28]:
df['new_created_date'] = pd.to_datetime(df['created_date'],
format='%m/%d/%y %H:%M:%S')
In [29]:
df['new_closed_date'] = pd.to_datetime(df['closed_date'],
format='%m/%d/%y %H:%M:%S')
In [30]:
df['time_open'] = df['new_closed_date'] - df['new_created_date']
In [31]:
df.head()
Out[31]:
In [32]:
df['time_open'].value_counts()
Out[32]:
In [33]:
time_open = pd.DataFrame(df['time_open'][df['time_open'] > pd.Timedelta(minutes=0)].dropna())
In [34]:
time_open
Out[34]:
In [35]:
time_open.median()
Out[35]:
In [36]:
time_open.mean()
Out[36]:
In [37]:
time_open.mode()
Out[37]:
In [38]:
time_open['time_open'].quantile(q=0.25)
Out[38]:
In [39]:
time_open['time_open'].quantile(q=0.5)
Out[39]:
In [40]:
time_open['time_open'].quantile(q=0.75)
Out[40]:
In [41]:
iqr = (time_open['time_open'].quantile(q=0.75)) - (time_open['time_open'].quantile(q=0.25))
In [42]:
iqr
Out[42]:
In [43]:
ual = (iqr * 1.5) + time_open.median()
In [44]:
ual
Out[44]:
In [45]:
lal = (iqr * 1.5) - time_open.median()
In [46]:
lal
Out[46]:
In [47]:
numpy.std(time_open)
Out[47]:
In [48]:
time_open.max()
Out[48]:
In [49]:
time_open.min()
Out[49]:
In [50]:
data_range = time_open.max() - time_open.min()
In [51]:
data_range
Out[51]:
In [52]:
(time_open.astype('timedelta64[m]')).hist(bins=100, figsize=(10,5))
Out[52]: