Perform a basic statistical analysis of the time DOT 311 (table is called dot_311) complaints are open (subtract closed date from created date)

Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com Port: 5432 Database: training Username: dot_student Password: qgis


In [1]:
!pip install pg8000


Collecting pg8000
  Downloading pg8000-1.10.6-py2.py3-none-any.whl
Requirement already satisfied (use --upgrade to upgrade): six>=1.10.0 in /home/ec2-user/anaconda3/lib/python3.5/site-packages (from pg8000)
Installing collected packages: pg8000
Successfully installed pg8000-1.10.6

In [2]:
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port= 5432, database= 'training', user= 'dot_student', password= 'qgis')

In [3]:
cursor = conn.cursor()

In [4]:
cursor.execute("select column_name from information_schema.columns where table_name='dot_311'")
column_name=[]
for item in cursor.fetchall():
    column_name.append(item)
#    print(item)

In [5]:
cursor.execute("select * from dot_311")
training=[]
for item in cursor.fetchall():
    #print(item)
    training.append(item)

In [6]:
import pandas as pd

In [7]:
df = pd.DataFrame(training)
df.columns = ['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor', 'location t', 'incident z', 'incident a', 'street nam', 'cross stre', 'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city', 'landmark', 'facility t', 'status', 'due date', 'resolution', 'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina', 'park facil', 'park borou', 'school nam', 'school num', 'school reg', 'school cod', 'school pho', 'school add', 'school cit', 'school sta', 'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa', 'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2', 'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude', 'location', 'geom', 'created_date', 'closed_date']

In [8]:
df.describe()


Out[8]:
gid
count 5851.000000
mean 2926.000000
std 1689.182544
min 1.000000
25% 1463.500000
50% 2926.000000
75% 4388.500000
max 5851.000000

In [139]:
df['closed_date_clean'] = pd.to_datetime(df['closed_date'], 
                                         format='%m/%d/%y %H:%M:%S')

In [140]:
df['created_date_clean'] = pd.to_datetime(df['created_date'], 
                                  format='%m/%d/%y %H:%M:%S')

In [141]:
df['open_complaints_time'] = df['closed_date_clean'] - df['created_date_clean']

In [142]:
df.columns


Out[142]:
Index(['gid', 'unique_key', 'agency', 'agency nam', 'complaint', 'descriptor',
       'location t', 'incident z', 'incident a', 'street nam', 'cross stre',
       'cross st_1', 'intersecti', 'intersec_1', 'address ty', 'city',
       'landmark', 'facility t', 'status', 'due date', 'resolution',
       'resoluti_1', 'community', 'borough', 'x coordina', 'y coordina',
       'park facil', 'park borou', 'school nam', 'school num', 'school reg',
       'school cod', 'school pho', 'school add', 'school cit', 'school sta',
       'school zip', 'school not', 'school or', 'vehicle ty', 'taxi compa',
       'taxi pick', 'bridge hig', 'bridge h_1', 'road ramp', 'bridge h_2',
       'garage lot', 'ferry dire', 'ferry term', 'latitude', 'longitude',
       'location', 'geom', 'created_date', 'closed_date', 'closed_date_clean',
       'created_date_clean', 'open_complaints_time'],
      dtype='object')

In [152]:
df.head()


Out[152]:
gid unique_key agency agency nam complaint descriptor location t incident z incident a street nam ... ferry term latitude longitude location geom created_date closed_date closed_date_clean created_date_clean open_complaints_time
0 2 32570549 DOT Department of Transportation Traffic Signal Condition Controller None 10301 None None ... None 40.613803753157917 -74.113373738864198 (40.61380375315792, -74.1133737388642) 0101000020E61000006829ED83418752C0D7FC121F914E... 2016-02-01 00:12:00 2016-02-01 01:15:00 2016-02-01 01:15:00 2016-02-01 00:12:00 01:03:00
1 3 32572958 DOT Department of Transportation Traffic Signal Condition Controller None 10301 None None ... None 40.626545297464929 -74.091582334480577 (40.62654529746493, -74.09158233448058) 0101000020E610000011DF267CDC8552C08809E5A23250... 2016-02-01 00:14:00 2016-02-01 00:54:00 2016-02-01 00:54:00 2016-02-01 00:14:00 00:40:00
2 4 32573576 DOT Department of Transportation Traffic Signal Condition Controller None 10310 None None ... None 40.629249554787492 -74.123220752604595 (40.62924955478749, -74.1232207526046) 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:15:00 2016-02-01 01:30:00 2016-02-01 01:30:00 2016-02-01 00:15:00 01:15:00
3 5 32572389 DOT Department of Transportation Traffic Signal Condition Controller None None None None ... None None None None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:17:00 2016-02-01 02:35:00 2016-02-01 02:35:00 2016-02-01 00:17:00 02:18:00
4 6 32570548 DOT Department of Transportation Traffic Signal Condition Controller None None None None ... None None None None 0101000020E610000004A84BD9E28752C0DC6AD93F8B50... 2016-02-01 00:19:00 2016-02-01 01:00:00 2016-02-01 01:00:00 2016-02-01 00:19:00 00:41:00

5 rows × 58 columns


In [153]:
df['open_complaints_time'].mode()


Out[153]:
0   0 days
dtype: timedelta64[ns]

In [154]:
df['open_complaints_time'].median()


Out[154]:
Timedelta('0 days 21:48:15')

In [157]:
#How to find corresponding complaint?

#df[df['open_complaints_time'] == '21:48:15']


Out[157]:
gid unique_key agency agency nam complaint descriptor location t incident z incident a street nam ... ferry term latitude longitude location geom created_date closed_date closed_date_clean created_date_clean open_complaints_time

0 rows × 58 columns


In [158]:
df['open_complaints_time'].mean()


Out[158]:
Timedelta('6 days 15:05:05.589380')

In [161]:
df['open_complaints_time'].max()


Out[161]:
Timedelta('89 days 18:54:00')

In [163]:
df['open_complaints_time'].min()
#WHAT WHY


Out[163]:
Timedelta('-19 days +09:29:00')

In [164]:
df['open_complaints_time'].describe()


Out[164]:
count                       5650
mean      6 days 15:05:05.589380
std      14 days 12:05:38.260805
min           -19 days +09:29:00
25%              0 days 01:14:00
50%              0 days 21:48:15
75%       4 days 00:30:48.500000
max             89 days 18:54:00
Name: open_complaints_time, dtype: object

In [165]:
df['open_complaints_time'].quantile(q=0.25)


Out[165]:
Timedelta('0 days 01:14:00')

In [166]:
df['open_complaints_time'].quantile(q=0.75)


Out[166]:
Timedelta('4 days 00:30:48.500000')

In [ ]: