Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com
Port: 5432
Database: training
Username: dot_student
Password: qgis
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
In [2]:
import pg8000
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database="training", port=5432, user='dot_student', password='qgis')
cursor = conn.cursor()
In [ ]:
# This can also work
# df = pd.read_sql("select * from noise_311", conn)
In [3]:
statement = "SELECT unique_key, complaint_type, created_date, closed_date FROM noise_311"
cursor.execute(statement)
complaints = []
for row in cursor:
complaints.append(row)
In [4]:
import pandas as pd
df = pd.DataFrame(complaints)
df.columns = ['unique_key', 'complaint', 'created_date', 'closed_date']
df_original = df.copy()
In [5]:
df.head()
Out[5]:
In [6]:
df['time'] = df['closed_date']-df['created_date']
df.head(3)
Out[6]:
In [7]:
df['time'].describe()
Out[7]:
Summary | DOT Values | NOISE Values |
---|---|---|
count | 4547 | 37615 |
mean | 8 days 16:15:03.514405 | 0 days 07:44:22.532500 |
std | 15 days 10:48:08.619990 | 1 days 12:06:09.012799 |
min | 0 days 00:00:01 | 0 days 00:01:00 |
25% | 0 days 12:41:58 | 0 days 00:54:00 |
50% | 1 days 03:33:17 | 0 days 02:02:00 |
75% | 6 days 00:30:23.500000 | 0 days 04:15:00 |
max | 89 days 18:54:00 | 48 days 05:45:00 |
In [9]:
df['time'].value_counts().sort_values(ascending=False)
Out[9]:
In [10]:
df.info()
In [12]:
df['time'].astype('timedelta64[h]')
Out[12]:
In [13]:
df['time']=(df['time'])/(pd.Timedelta(hours=1))
df['time'].head(10)
Out[13]:
In [14]:
df['time'].describe()
Out[14]:
DOT FILE count 4547.000000 mean 208.250976 std 370.802394 min 0.000278 25% 12.699444 50% 27.554722 75% 144.506528 max 2154.900000 Name: time, dtype: float64
In [18]:
df['time'].hist(bins=100)
plt.xlabel('Time (hours)')
plt.ylabel('# of complaints')
Out[18]:
In [ ]:
# Frequency Table
graph01 = df['complaint'].value_counts().head(3).sort_values(ascending=True).plot(kind='barh')
print("Top 3 popular type of complaint\n")
print(df['complaint'].value_counts().head(3).sort_values(ascending=False))
graph01
In [ ]:
df.index = df['created_date']
df.head(2)
In [ ]:
df.groupby(df.index.hour).count().plot(y='unique_key',kind='bar')
In [ ]: