Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com
Port: 5432
Database: training
Username: dot_student
Password: qgis
In [87]:
import pandas as pd
import pg8000
conn = pg8000.connect(user='dot_student', host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training', password='qgis')
In [88]:
import dateutil.parser
In [89]:
conn.rollback()
In [90]:
cursor = conn.cursor()
print(type(cursor))
In [91]:
cursor.execute("select * from dot_311")
In [92]:
#This is magical. hat tip to Harsha's googling, I did not come up with this on my own.
df = pd.read_sql("select * from dot_311", conn)
In [93]:
df.head()
Out[93]:
In [94]:
df.head()
df.columns
df.dtypes
Out[94]:
In [95]:
#Renaming the columns because the apostrophes are weird.
df.columns = [b'gid', b'unique_key', b'agency', b'agency nam',
b'complaint', b'descriptor', b'location t', b'incident z',
b'incident a', b'street nam', b'cross stre', b'cross st_1',
b'intersecti', b'intersec_1', b'address ty', b'city',
b'landmark', b'facility t', b'status', b'due date',
b'resolution', b'resoluti_1', b'community', b'borough',
b'x coordina', b'y coordina', b'park facil', b'park borou',
b'school nam', b'school num', b'school reg', b'school cod',
b'school pho', b'school add', b'school cit', b'school sta',
b'school zip', b'school not', b'school or', b'vehicle ty',
b'taxi compa', b'taxi pick', b'bridge hig', b'bridge h_1',
b'road ramp', b'bridge h_2', b'garage lot', b'ferry dire',
b'ferry term', b'latitude', b'longitude', b'location',
b'geom', 'created_date', 'closed_date']
In [96]:
#Make a new column called "open_time", which is closed_date - created_date
df['open_time'] = (df['closed_date']-df['created_date']).astype('timedelta64[h]')
In [110]:
#So, there are clearly some issues with this data.
#This closed date is a significant amount of time before the created date.
df[df['open_time'] == -447.0]
Out[110]:
In [104]:
#I am choosing to only include the values that are not "NaN" as a result of all the "NaT" in the closed date column.
#I assume these are still open, but I don't think it's appropriate to include them.
df[df['open_time'].notnull()]['open_time'].describe()
Out[104]:
In [111]:
#75%
df[df['open_time'].notnull()]['open_time'].quantile(q=.75)
Out[111]:
In [115]:
#50%, aka the median
df[df['open_time'].notnull()]['open_time'].quantile(q=.5)
Out[115]:
In [114]:
#inter quartile range (between 50% and 75%)
df[df['open_time'].notnull()]['open_time'].quantile(q=.75)-df[df['open_time'].notnull()]['open_time'].quantile(q=.5)
Out[114]:
In [125]:
#These are the outliers on top. 1.5 IQR + 3rd quartile
df[df['open_time'] > (112.5+96)]['open_time']
Out[125]:
In [124]:
#These are the outliers on the bottom. This data is lying. These points should be discarded, clearly. 1st quartile - 1.5 IQR
df[df['open_time'] < (1-112.5)]['open_time']
Out[124]:
In [ ]: