In [1]:
import pg8000
import pandas as pd
import matplotlib
import numpy
%matplotlib inline
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", port=5432, database="training", user="dot_student", password="qgis")
In [2]:
cursor = conn.cursor()
In [3]:
conn.rollback()
In [4]:
cursor.execute("SELECT * FROM dot_311")
results = []
for item in cursor.fetchall():
results.append(item)
In [5]:
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
columns_names = []
for item in cursor.fetchall():
print(item)
columns_names.append(item)
In [6]:
df= pd.DataFrame(results)
In [7]:
df.columns = ['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor', 'location t', 'incident z', 'incident a', 'street nam', 'cross stre', 'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city', 'landmark', 'facility t', 'status', 'due date', 'resolution', 'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina', 'park facil', 'park borou', 'school nam', 'school num', 'school reg', 'school cod', 'school pho', 'school add', 'school cit', 'school sta', 'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa', 'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2', 'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude', 'location', 'geom', 'created_date', 'closed_date']
In [8]:
df.head()
Out[8]:
In [9]:
df.columns
Out[9]:
In [10]:
df['new_created_date'] = pd.to_datetime(df['created_date'],
format='%m/%d/%y %H:%M:%S')
In [11]:
df['new_closed_date'] = pd.to_datetime(df['closed_date'],
format='%m/%d/%y %H:%M:%S')
In [12]:
df['time_open'] = df['new_closed_date'] - df['new_created_date']
In [13]:
df.head()
Out[13]:
In [14]:
df['time_open'].value_counts()
Out[14]:
In [15]:
time_open = pd.DataFrame(df['time_open'][df['time_open'] > pd.Timedelta(minutes=0)].dropna())
In [16]:
time_open
Out[16]:
In [17]:
time_open.median()
Out[17]:
In [18]:
time_open.mean()
Out[18]:
In [19]:
time_open.mode()
Out[19]:
In [20]:
time_open['time_open'].quantile(q=0.25)
Out[20]:
In [21]:
time_open['time_open'].quantile(q=0.5)
Out[21]:
In [22]:
time_open['time_open'].quantile(q=0.75)
Out[22]:
In [23]:
iqr = (time_open['time_open'].quantile(q=0.75)) - (time_open['time_open'].quantile(q=0.25))
In [24]:
iqr
Out[24]:
In [25]:
ual = (iqr * 1.5) + time_open.median()
In [26]:
ual
Out[26]:
In [27]:
lal = (iqr * 1.5) - time_open.median()
In [28]:
lal
Out[28]:
In [29]:
numpy.std(time_open)
Out[29]:
In [30]:
time_open.max()
Out[30]:
In [31]:
time_open.min()
Out[31]:
In [32]:
data_range = time_open.max() - time_open.min()
In [33]:
data_range
Out[33]:
In [67]:
(time_open.astype('timedelta64[m]')).hist(bins=100, figsize=(10,5))
Out[67]:
This data are skewed by some outliers, especially in the Q4: while 75% of the complaints have been open 6 days or less, the longest complaint has been open for 89 days - which is also the range of this data -, and the shortest is only one second long. This is also shown by the standard deviation, that is 15 days, and the difference between the mean (1 day) and the mean (8 days and a half). In fact, the most common value is 20 minutes.