Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com Port: 5432 Database: training Username: dot_student Password: qgis


In [87]:
import pandas as pd
import pg8000
conn = pg8000.connect(user='dot_student', host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training', password='qgis')

In [88]:
import dateutil.parser

In [89]:
conn.rollback()

In [90]:
cursor = conn.cursor()
print(type(cursor))


<class 'pg8000.core.Cursor'>

In [91]:
cursor.execute("select * from dot_311")

In [92]:
#This is magical. hat tip to Harsha's googling, I did not come up with this on my own.
df = pd.read_sql("select * from dot_311", conn)

In [93]:
df.head()


Out[93]:
b'gid' b'unique_key' b'agency' b'agency nam' b'complaint' b'descriptor' b'location t' b'incident z' b'incident a' b'street nam' ... b'bridge h_2' b'garage lot' b'ferry dire' b'ferry term' b'latitude' b'longitude' b'location' b'geom' b'created_date' b'closed_date'
0 2 32570549.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.613804 -74.113374 (40.61380375315792, -74.1133737388642) 0101000020E61000006829ED83418752C0D7FC121F914E... 2016-02-01 00:12:00 2016-02-01 01:15:00
1 3 32572958.0 DOT Department of Transportation Traffic Signal Condition Controller None 10301.0 None None ... None None None None 40.626545 -74.091582 (40.62654529746493, -74.09158233448058) 0101000020E610000011DF267CDC8552C08809E5A23250... 2016-02-01 00:14:00 2016-02-01 00:54:00
2 4 32573576.0 DOT Department of Transportation Traffic Signal Condition Controller None 10310.0 None None ... None None None None 40.629250 -74.123221 (40.62924955478749, -74.1232207526046) 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:15:00 2016-02-01 01:30:00
3 5 32572389.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:17:00 2016-02-01 02:35:00
4 6 32570548.0 DOT Department of Transportation Traffic Signal Condition Controller None NaN None None ... None None None None NaN NaN None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:19:00 2016-02-01 01:00:00

5 rows × 55 columns


In [94]:
df.head()
df.columns
df.dtypes


Out[94]:
b'gid'                      int64
b'unique_key'             float64
b'agency'                  object
b'agency nam'              object
b'complaint'               object
b'descriptor'              object
b'location t'              object
b'incident z'             float64
b'incident a'              object
b'street nam'              object
b'cross stre'              object
b'cross st_1'              object
b'intersecti'              object
b'intersec_1'              object
b'address ty'              object
b'city'                    object
b'landmark'                object
b'facility t'              object
b'status'                  object
b'due date'                object
b'resolution'              object
b'resoluti_1'              object
b'community'               object
b'borough'                 object
b'x coordina'             float64
b'y coordina'             float64
b'park facil'              object
b'park borou'              object
b'school nam'              object
b'school num'              object
b'school reg'              object
b'school cod'              object
b'school pho'              object
b'school add'              object
b'school cit'              object
b'school sta'              object
b'school zip'              object
b'school not'              object
b'school or'               object
b'vehicle ty'              object
b'taxi compa'              object
b'taxi pick'               object
b'bridge hig'              object
b'bridge h_1'              object
b'road ramp'               object
b'bridge h_2'              object
b'garage lot'              object
b'ferry dire'              object
b'ferry term'              object
b'latitude'               float64
b'longitude'              float64
b'location'                object
b'geom'                    object
b'created_date'    datetime64[ns]
b'closed_date'     datetime64[ns]
dtype: object

In [95]:
#Renaming the columns because the apostrophes are weird.
df.columns = [b'gid',   b'unique_key',       b'agency',   b'agency nam',
          b'complaint',   b'descriptor',   b'location t',   b'incident z',
         b'incident a',   b'street nam',   b'cross stre',   b'cross st_1',
         b'intersecti',   b'intersec_1',   b'address ty',         b'city',
           b'landmark',   b'facility t',       b'status',     b'due date',
         b'resolution',   b'resoluti_1',    b'community',      b'borough',
         b'x coordina',   b'y coordina',   b'park facil',   b'park borou',
         b'school nam',   b'school num',   b'school reg',   b'school cod',
         b'school pho',   b'school add',   b'school cit',   b'school sta',
         b'school zip',   b'school not',    b'school or',   b'vehicle ty',
         b'taxi compa',    b'taxi pick',   b'bridge hig',   b'bridge h_1',
          b'road ramp',   b'bridge h_2',   b'garage lot',   b'ferry dire',
         b'ferry term',     b'latitude',    b'longitude',     b'location',
               b'geom', 'created_date',  'closed_date']

In [96]:
#Make a new column called "open_time", which is closed_date - created_date
df['open_time'] = (df['closed_date']-df['created_date']).astype('timedelta64[h]')

In [110]:
#So, there are clearly some issues with this data.
#This closed date is a significant amount of time before the created date.

df[df['open_time'] == -447.0]


Out[110]:
b'gid' b'unique_key' b'agency' b'agency nam' b'complaint' b'descriptor' b'location t' b'incident z' b'incident a' b'street nam' ... b'garage lot' b'ferry dire' b'ferry term' b'latitude' b'longitude' b'location' b'geom' created_date closed_date open_time
1625 1621 32585502.0 DOT Department of Transportation Street Light Condition Street Light Out None NaN EASTCHESTER BRIDGE EASTCHESTER BRIDGE ... None None None NaN NaN None 0101000020E610000024D5B235827252C019982D7FFD61... 2016-02-02 14:36:00 2016-01-15 00:05:00 -447.0

1 rows × 56 columns


In [104]:
#I am choosing to only include the values that are not "NaN" as a result of all the "NaT" in the closed date column.
#I assume these are still open, but I don't think it's appropriate to include them.
df[df['open_time'].notnull()]['open_time'].describe()


Out[104]:
count    5650.000000
mean      158.657345
std       348.049637
min      -447.000000
25%         1.000000
50%        21.000000
75%        96.000000
max      2154.000000
Name: open_time, dtype: float64

In [111]:
#75%
df[df['open_time'].notnull()]['open_time'].quantile(q=.75)


Out[111]:
96.0

In [115]:
#50%, aka the median
df[df['open_time'].notnull()]['open_time'].quantile(q=.5)


Out[115]:
21.0

In [114]:
#inter quartile range (between 50% and 75%)
df[df['open_time'].notnull()]['open_time'].quantile(q=.75)-df[df['open_time'].notnull()]['open_time'].quantile(q=.5)


Out[114]:
75.0

In [125]:
#These are the outliers on top. 1.5 IQR + 3rd quartile
df[df['open_time'] > (112.5+96)]['open_time']


Out[125]:
12       224.0
32      1421.0
34       558.0
38      1514.0
140     1056.0
145     1945.0
178     1055.0
179     1055.0
185      710.0
190      768.0
191      720.0
193      720.0
208      793.0
209     1007.0
216     1007.0
229     1055.0
233      603.0
241     1054.0
248     1054.0
249      743.0
270     1054.0
279      792.0
282      576.0
285      742.0
287      913.0
292     1005.0
304      719.0
310     1005.0
311     1005.0
313      942.0
         ...  
5742    1150.0
5744     661.0
5745     399.0
5748     642.0
5753    1149.0
5755    1120.0
5757     233.0
5758    1124.0
5763     278.0
5764    1117.0
5767     229.0
5769     903.0
5773     997.0
5774     449.0
5778     981.0
5779     270.0
5785     235.0
5786     448.0
5789     445.0
5790     233.0
5801     377.0
5808     256.0
5809     658.0
5810     612.0
5811     782.0
5828     924.0
5830     467.0
5833     371.0
5834     250.0
5839     226.0
Name: open_time, dtype: float64

In [124]:
#These are the outliers on the bottom. This data is lying. These points should be discarded, clearly. 1st quartile - 1.5 IQR
df[df['open_time'] < (1-112.5)]['open_time']


Out[124]:
676    -120.0
679    -163.0
1625   -447.0
1630   -446.0
1633   -445.0
3316   -144.0
3671   -144.0
3674   -144.0
3675   -144.0
3677   -144.0
3678   -144.0
3679   -144.0
3680   -144.0
3681   -145.0
3683   -144.0
3705   -144.0
3709   -144.0
3710   -144.0
3712   -144.0
3715   -144.0
3716   -144.0
3717   -145.0
3718   -144.0
3719   -144.0
3720   -144.0
3721   -144.0
3722   -144.0
3725   -144.0
3726   -144.0
3727   -145.0
        ...  
3998   -144.0
3999   -144.0
4000   -144.0
4001   -148.0
4003   -144.0
4004   -144.0
4005   -144.0
4006   -144.0
4007   -149.0
4008   -144.0
4009   -144.0
4010   -144.0
4011   -144.0
4012   -154.0
4013   -144.0
4015   -144.0
4016   -144.0
4018   -144.0
4019   -144.0
4020   -144.0
4023   -144.0
4024   -144.0
4025   -144.0
4026   -144.0
4027   -144.0
4028   -149.0
4030   -145.0
4031   -144.0
4032   -144.0
4033   -144.0
Name: open_time, dtype: float64

In [ ]: