Host: training.c1erymiua9dx.us-east-1.rds.amazonaws.com
Port: 5432
Database: training
Username: dot_student
Password: qgis
In [75]:
import pg8000
import pandas as pd
In [76]:
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com',port=5432,database='training',user='dot_student',password='qgis')
In [77]:
conn.rollback()
In [71]:
cursor = conn.cursor()
statement="SELECT * from INFORMATION_SCHEMA.COLUMNs where table_name='dot_311'"
cursor.execute(statement)
columns=[]
for row in cursor.fetchall():
columns.append(row[3])
print(columns)
In [95]:
cursor = conn.cursor()
cursor.execute("SELECT unique_key,agency,complaint,descriptor,created_date,closed_date,borough from dot_311")
complaints=[]
for item in cursor:
complaints.append(item)
In [145]:
df = pd.DataFrame(complaints)
df.columns = ["unique_key","agency","complaint","descriptor","created_date","closed_date","borough"]
In [146]:
df['efficiency']=df['closed_date']-df['created_date']
In [99]:
df
Out[99]:
In [104]:
%matplotlib inline
In [112]:
df['efficiency'].describe()
Out[112]:
In [204]:
df['eff_new']=df['closed_date']-df['created_date']
In [205]:
def get_days(t):
try:
return t.days
except:
pass
df['eff_new'] = df['eff_new'].apply(get_days)
In [206]:
df['eff_new']
Out[206]:
In [207]:
df[df['eff_new']>0]['eff_new'].hist(bins=20)
Out[207]:
In [116]:
df2 = pd.DataFrame(complaints)
df2.columns = ["unique_key","agency","complaint","descriptor","created_date","closed_date","borough"]
df2['efficiency']=df2['closed_date']-df2['created_date']
In [125]:
df2.index=df2['efficiency']
In [168]:
df2.groupby(by=df2.index.days)['unique_key'].count().plot()
Out[168]:
In [156]:
df.info()
In [162]:
df.groupby(by='complaint').count()['unique_key'].sort_values().plot(kind="barh")
Out[162]:
In [163]:
df.groupby(by='borough').count()['unique_key'].sort_values().plot(kind="barh")
Out[163]:
In [210]:
new_df = df[df['eff_new']>0]
new_df['eff_new'].describe()
Out[210]:
In [211]:
new_df['eff_new'].median()
Out[211]:
In [213]:
new_df['eff_new'].mode()
Out[213]:
In [212]:
# UAL
28+27*1.5
Out[212]:
In [216]:
# outlier
new_df[new_df['eff_new']>68.5].count()
Out[216]:
In [ ]:
In [ ]: