notebook.community

Edit and run

Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com Port: 5432 Database: training Username: dot_student Password: qgis



In [87]:

    
import pandas as pd
import pg8000
conn = pg8000.connect(user='dot_student', host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training', password='qgis')



In [88]:

    
import dateutil.parser



In [89]:

    
conn.rollback()



In [90]:

    
cursor = conn.cursor()
print(type(cursor))









    



<class 'pg8000.core.Cursor'>



In [91]:

    
cursor.execute("select * from dot_311")



In [92]:

    
#This is magical. hat tip to Harsha's googling, I did not come up with this on my own.
df = pd.read_sql("select * from dot_311", conn)



In [93]:

    
df.head()









    Out[93]:






  
    
      
      b'gid'
      b'unique_key'
      b'agency'
      b'agency nam'
      b'complaint'
      b'descriptor'
      b'location t'
      b'incident z'
      b'incident a'
      b'street nam'
      ...
      b'bridge h_2'
      b'garage lot'
      b'ferry dire'
      b'ferry term'
      b'latitude'
      b'longitude'
      b'location'
      b'geom'
      b'created_date'
      b'closed_date'
    
  
  
    
      0
      2
      32570549.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301.0
      None
      None
      ...
      None
      None
      None
      None
      40.613804
      -74.113374
      (40.61380375315792, -74.1133737388642)
      0101000020E61000006829ED83418752C0D7FC121F914E...
      2016-02-01 00:12:00
      2016-02-01 01:15:00
    
    
      1
      3
      32572958.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10301.0
      None
      None
      ...
      None
      None
      None
      None
      40.626545
      -74.091582
      (40.62654529746493, -74.09158233448058)
      0101000020E610000011DF267CDC8552C08809E5A23250...
      2016-02-01 00:14:00
      2016-02-01 00:54:00
    
    
      2
      4
      32573576.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      10310.0
      None
      None
      ...
      None
      None
      None
      None
      40.629250
      -74.123221
      (40.62924955478749, -74.1232207526046)
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:15:00
      2016-02-01 01:30:00
    
    
      3
      5
      32572389.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      NaN
      None
      None
      ...
      None
      None
      None
      None
      NaN
      NaN
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:17:00
      2016-02-01 02:35:00
    
    
      4
      6
      32570548.0
      DOT
      Department of Transportation
      Traffic Signal Condition
      Controller
      None
      NaN
      None
      None
      ...
      None
      None
      None
      None
      NaN
      NaN
      None
      0101000020E610000004A84BD9E28752C0DC6AD93F8B50...
      2016-02-01 00:19:00
      2016-02-01 01:00:00
    
  

5 rows × 55 columns



In [94]:

    
df.head()
df.columns
df.dtypes









    Out[94]:





b'gid'                      int64
b'unique_key'             float64
b'agency'                  object
b'agency nam'              object
b'complaint'               object
b'descriptor'              object
b'location t'              object
b'incident z'             float64
b'incident a'              object
b'street nam'              object
b'cross stre'              object
b'cross st_1'              object
b'intersecti'              object
b'intersec_1'              object
b'address ty'              object
b'city'                    object
b'landmark'                object
b'facility t'              object
b'status'                  object
b'due date'                object
b'resolution'              object
b'resoluti_1'              object
b'community'               object
b'borough'                 object
b'x coordina'             float64
b'y coordina'             float64
b'park facil'              object
b'park borou'              object
b'school nam'              object
b'school num'              object
b'school reg'              object
b'school cod'              object
b'school pho'              object
b'school add'              object
b'school cit'              object
b'school sta'              object
b'school zip'              object
b'school not'              object
b'school or'               object
b'vehicle ty'              object
b'taxi compa'              object
b'taxi pick'               object
b'bridge hig'              object
b'bridge h_1'              object
b'road ramp'               object
b'bridge h_2'              object
b'garage lot'              object
b'ferry dire'              object
b'ferry term'              object
b'latitude'               float64
b'longitude'              float64
b'location'                object
b'geom'                    object
b'created_date'    datetime64[ns]
b'closed_date'     datetime64[ns]
dtype: object



In [95]:

    
#Renaming the columns because the apostrophes are weird.
df.columns = [b'gid',   b'unique_key',       b'agency',   b'agency nam',
          b'complaint',   b'descriptor',   b'location t',   b'incident z',
         b'incident a',   b'street nam',   b'cross stre',   b'cross st_1',
         b'intersecti',   b'intersec_1',   b'address ty',         b'city',
           b'landmark',   b'facility t',       b'status',     b'due date',
         b'resolution',   b'resoluti_1',    b'community',      b'borough',
         b'x coordina',   b'y coordina',   b'park facil',   b'park borou',
         b'school nam',   b'school num',   b'school reg',   b'school cod',
         b'school pho',   b'school add',   b'school cit',   b'school sta',
         b'school zip',   b'school not',    b'school or',   b'vehicle ty',
         b'taxi compa',    b'taxi pick',   b'bridge hig',   b'bridge h_1',
          b'road ramp',   b'bridge h_2',   b'garage lot',   b'ferry dire',
         b'ferry term',     b'latitude',    b'longitude',     b'location',
               b'geom', 'created_date',  'closed_date']



In [96]:

    
#Make a new column called "open_time", which is closed_date - created_date
df['open_time'] = (df['closed_date']-df['created_date']).astype('timedelta64[h]')



In [110]:

    
#So, there are clearly some issues with this data.
#This closed date is a significant amount of time before the created date.

df[df['open_time'] == -447.0]









    Out[110]:






  
    
      
      b'gid'
      b'unique_key'
      b'agency'
      b'agency nam'
      b'complaint'
      b'descriptor'
      b'location t'
      b'incident z'
      b'incident a'
      b'street nam'
      ...
      b'garage lot'
      b'ferry dire'
      b'ferry term'
      b'latitude'
      b'longitude'
      b'location'
      b'geom'
      created_date
      closed_date
      open_time
    
  
  
    
      1625
      1621
      32585502.0
      DOT
      Department of Transportation
      Street Light Condition
      Street Light Out
      None
      NaN
      EASTCHESTER BRIDGE
      EASTCHESTER BRIDGE
      ...
      None
      None
      None
      NaN
      NaN
      None
      0101000020E610000024D5B235827252C019982D7FFD61...
      2016-02-02 14:36:00
      2016-01-15 00:05:00
      -447.0
    
  

1 rows × 56 columns



In [104]:

    
#I am choosing to only include the values that are not "NaN" as a result of all the "NaT" in the closed date column.
#I assume these are still open, but I don't think it's appropriate to include them.
df[df['open_time'].notnull()]['open_time'].describe()









    Out[104]:





count    5650.000000
mean      158.657345
std       348.049637
min      -447.000000
25%         1.000000
50%        21.000000
75%        96.000000
max      2154.000000
Name: open_time, dtype: float64



In [111]:

    
#75%
df[df['open_time'].notnull()]['open_time'].quantile(q=.75)









    Out[111]:





96.0



In [115]:

    
#50%, aka the median
df[df['open_time'].notnull()]['open_time'].quantile(q=.5)









    Out[115]:





21.0



In [114]:

    
#inter quartile range (between 50% and 75%)
df[df['open_time'].notnull()]['open_time'].quantile(q=.75)-df[df['open_time'].notnull()]['open_time'].quantile(q=.5)









    Out[114]:





75.0



In [125]:

    
#These are the outliers on top. 1.5 IQR + 3rd quartile
df[df['open_time'] > (112.5+96)]['open_time']









    Out[125]:





12       224.0
32      1421.0
34       558.0
38      1514.0
140     1056.0
145     1945.0
178     1055.0
179     1055.0
185      710.0
190      768.0
191      720.0
193      720.0
208      793.0
209     1007.0
216     1007.0
229     1055.0
233      603.0
241     1054.0
248     1054.0
249      743.0
270     1054.0
279      792.0
282      576.0
285      742.0
287      913.0
292     1005.0
304      719.0
310     1005.0
311     1005.0
313      942.0
         ...  
5742    1150.0
5744     661.0
5745     399.0
5748     642.0
5753    1149.0
5755    1120.0
5757     233.0
5758    1124.0
5763     278.0
5764    1117.0
5767     229.0
5769     903.0
5773     997.0
5774     449.0
5778     981.0
5779     270.0
5785     235.0
5786     448.0
5789     445.0
5790     233.0
5801     377.0
5808     256.0
5809     658.0
5810     612.0
5811     782.0
5828     924.0
5830     467.0
5833     371.0
5834     250.0
5839     226.0
Name: open_time, dtype: float64



In [124]:

    
#These are the outliers on the bottom. This data is lying. These points should be discarded, clearly. 1st quartile - 1.5 IQR
df[df['open_time'] < (1-112.5)]['open_time']









    Out[124]:





676    -120.0
679    -163.0
1625   -447.0
1630   -446.0
1633   -445.0
3316   -144.0
3671   -144.0
3674   -144.0
3675   -144.0
3677   -144.0
3678   -144.0
3679   -144.0
3680   -144.0
3681   -145.0
3683   -144.0
3705   -144.0
3709   -144.0
3710   -144.0
3712   -144.0
3715   -144.0
3716   -144.0
3717   -145.0
3718   -144.0
3719   -144.0
3720   -144.0
3721   -144.0
3722   -144.0
3725   -144.0
3726   -144.0
3727   -145.0
        ...  
3998   -144.0
3999   -144.0
4000   -144.0
4001   -148.0
4003   -144.0
4004   -144.0
4005   -144.0
4006   -144.0
4007   -149.0
4008   -144.0
4009   -144.0
4010   -144.0
4011   -144.0
4012   -154.0
4013   -144.0
4015   -144.0
4016   -144.0
4018   -144.0
4019   -144.0
4020   -144.0
4023   -144.0
4024   -144.0
4025   -144.0
4026   -144.0
4027   -144.0
4028   -149.0
4030   -145.0
4031   -144.0
4032   -144.0
4033   -144.0
Name: open_time, dtype: float64



In [ ]:

	b'gid'	b'unique_key'	b'agency'	b'agency nam'	b'complaint'	b'descriptor'	b'location t'	b'incident z'	b'incident a'	b'street nam'	...	b'bridge h_2'	b'garage lot'	b'ferry dire'	b'ferry term'	b'latitude'	b'longitude'	b'location'	b'geom'	b'created_date'	b'closed_date'
0	2	32570549.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10301.0	None	None	...	None	None	None	None	40.613804	-74.113374	(40.61380375315792, -74.1133737388642)	0101000020E61000006829ED83418752C0D7FC121F914E...	2016-02-01 00:12:00	2016-02-01 01:15:00
1	3	32572958.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10301.0	None	None	...	None	None	None	None	40.626545	-74.091582	(40.62654529746493, -74.09158233448058)	0101000020E610000011DF267CDC8552C08809E5A23250...	2016-02-01 00:14:00	2016-02-01 00:54:00
2	4	32573576.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	10310.0	None	None	...	None	None	None	None	40.629250	-74.123221	(40.62924955478749, -74.1232207526046)	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:15:00	2016-02-01 01:30:00
3	5	32572389.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	NaN	None	None	...	None	None	None	None	NaN	NaN	None	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:17:00	2016-02-01 02:35:00
4	6	32570548.0	DOT	Department of Transportation	Traffic Signal Condition	Controller	None	NaN	None	None	...	None	None	None	None	NaN	NaN	None	0101000020E610000004A84BD9E28752C0DC6AD93F8B50...	2016-02-01 00:19:00	2016-02-01 01:00:00