In [1]:
import pandas as pd
import numpy as np
import datetime
# Load 311_data.csv into a DataFrame; the path is relative to the working directory.
data = pd.read_csv('311_data.csv')
In [2]:
# Keep only the columns used in this analysis. The explicit .copy() makes
# ccab_data an independent frame, so assigning new columns to it later neither
# triggers SettingWithCopyWarning nor depends on pandas' view/copy behaviour.
ccab_data = data[['Created Date', 'Closed Date', 'Agency', 'Borough']].copy()
# Preview the first five rows.
ccab_data.head()
Out[2]:
In [3]:
# Print the column dtypes and non-null counts of the selected columns.
ccab_data.info()
In [25]:
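# Disabled filter, kept for reference: it would keep only the rows where both
# 'Closed Date' and 'Borough' are present.
#ccab_data = ccab_data[(ccab_data['Closed Date'].notnull()) & (ccab_data['Borough'].notnull())]
#ccab_data.info()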
In [6]:
# Parse 'Created Date' with one vectorised call instead of a per-row
# datetime.strptime. The format is month/day/year with a 12-hour clock and an
# AM/PM marker; a value that does not match still raises, as strptime did.
ccab_data['Created Date'] = pd.to_datetime(
    ccab_data['Created Date'], format='%m/%d/%Y %I:%M:%S %p'
)
ccab_data.info()
In [8]:
# Parse 'Closed Date' in one vectorised call. errors='coerce' turns missing or
# malformed values into NaT, so the column is always datetime-typed and can be
# subtracted from; errors='ignore' (deprecated) would leave such values behind
# as raw strings.
ccab_data['Closed Date'] = pd.to_datetime(
    ccab_data['Closed Date'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce'
)
In [9]:
# Print the column dtypes and non-null counts again, now that the date columns have been converted.
ccab_data.info()
In [10]:
# Elapsed time per request: 'Closed Date' minus 'Created Date'.
closed_at = ccab_data['Closed Date']
created_at = ccab_data['Created Date']
ccab_data['processing_time'] = closed_at.sub(created_at)
In [11]:
# Show the first five rows, including the new column.
ccab_data.head(5)
Out[11]:
In [12]:
# Hour of day (0-23) in which each request was created, read through the
# vectorised .dt accessor rather than a per-row lambda.
ccab_data['start_time_window'] = ccab_data['Created Date'].dt.hour
In [13]:
# Write the frame to output.csv; the row index is written as the first column
# because index=False is not passed.
ccab_data.to_csv('output.csv')
In [14]:
# Read output.csv back into a new frame (presumably to sanity-check the export).
data_output = pd.read_csv('output.csv')
In [16]:
def part1():
    """Build the cleaned 311 subset and write it to ``output1.csv``.

    Reads ``311_data.csv`` from the working directory, keeps the
    'Created Date', 'Closed Date', 'Agency' and 'Borough' columns, parses both
    date columns and adds two derived columns:

    * ``processing_time``   - 'Closed Date' minus 'Created Date'
    * ``start_time_window`` - hour of day taken from 'Created Date'

    Returns:
        pandas.DataFrame: the frame that was written to ``output1.csv``.

    Raises:
        ValueError: if a 'Created Date' value does not match the expected
            month/day/year 12-hour format.
    """
    date_format = '%m/%d/%Y %I:%M:%S %p'
    data = pd.read_csv('311_data.csv')
    # Copy the column subset so the assignments below modify an independent
    # frame (no SettingWithCopyWarning, no reliance on view/copy semantics).
    ccab_data = data[['Created Date', 'Closed Date', 'Agency', 'Borough']].copy()
    # Vectorised string -> datetime conversion.
    ccab_data['Created Date'] = pd.to_datetime(ccab_data['Created Date'], format=date_format)
    # Missing or malformed close dates become NaT, so the subtraction below
    # always operates on datetime values.
    ccab_data['Closed Date'] = pd.to_datetime(
        ccab_data['Closed Date'], format=date_format, errors='coerce'
    )
    ccab_data['processing_time'] = ccab_data['Closed Date'] - ccab_data['Created Date']
    # Hour of the day in which the incident report was created.
    ccab_data['start_time_window'] = ccab_data['Created Date'].dt.hour
    ccab_data.to_csv('output1.csv')
    return ccab_data
In [17]:
# Run the pipeline; as the cell's last expression, the returned DataFrame is displayed.
part1()
In [201]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
# Leave extra room at the bottom of the current figure.
plt.gcf().subplots_adjust(bottom=0.15)
#Your code starts here
data = pd.read_csv('311_data_2.csv')
# Convert the date strings to datetimes with vectorised calls. The format is
# month/day/year with a 12-hour clock and an AM/PM marker.
date_format = '%m/%d/%Y %I:%M:%S %p'
data['Created Date'] = pd.to_datetime(data['Created Date'], format=date_format)
# Missing or malformed close dates become NaT rather than staying as strings.
data['Closed Date'] = pd.to_datetime(data['Closed Date'], format=date_format, errors='coerce')
# Create the 'processing_time' column. Both operands come from this frame;
# subtracting another frame's 'Created Date' would pair up unrelated rows by
# index.
data['processing_time'] = data['Closed Date'] - data['Created Date']
# Create a new column - 'start_time_window' - that contains the hour of the day that the incident report was created
data['start_time_window'] = data['Created Date'].dt.hour
In [202]:
def convert_time(x):
    """Convert a duration to a whole number of hours.

    Args:
        x: any value accepted by ``np.timedelta64(x, 'h')``, for example a
            ``datetime.timedelta`` or a ``numpy.timedelta64``.

    Returns:
        int: the duration expressed in hour units, or 0 when ``x`` cannot be
        converted or is a missing duration (NaT).
    """
    try:
        hours = np.timedelta64(x, 'h')
    except Exception:
        # Best-effort fallback for values that are not durations. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0
    if np.isnat(hours):
        # NaT would otherwise be cast to the minimum int64 value.
        return 0
    return hours.astype('int').item()
In [203]:
# Plain assignment: group_data is a second name for the same frame as `data`,
# so the conversion below is visible through `data` as well.
group_data = data
# Replace every processing_time entry with the value convert_time returns for it.
group_data['processing_time'] = group_data['processing_time'].apply(convert_time)
In [238]:
# Mean processing_time for each start_time_window value.
hourly_groups = group_data.groupby('start_time_window')
start_time_group = hourly_groups.agg({'processing_time': ['mean']})
# The list-valued aggregation spec produces two-level columns, hence the
# two-step lookup to reach the series of means.
start_time_group['processing_time']['mean']
Out[238]:
In [244]:
# Bar chart of the mean processing_time per start_time_window, saved as 1.png.
fig, ax = plt.subplots(figsize=(8, 5.5))
mean_by_hour = start_time_group['processing_time']['mean']
# Take the bar positions from the group index rather than a hard-coded list, so
# the chart stays correct for whatever set of hours is present in the data.
ax.bar(mean_by_hour.index, mean_by_hour.values, align='center')
ax.set_title('start_time_window')
ax.set_xlabel('start_time_window')
ax.set_ylabel('mean processing_time')
plt.savefig("1.png", dpi=100)
plt.clf()
In [40]:
# Box plot of processing_time for each start_time_window, saved as 2.png.
fig, ax = plt.subplots(figsize=(8, 5.5))
sns.boxplot(data=data, x='start_time_window', y='processing_time', ax=ax)
plt.savefig("2.png", dpi=100)
plt.clf()
In [44]:
# Number of rows per agency, largest first, drawn as a bar chart and saved as 3.png.
agency_counts = data.groupby('Agency').size()
agency_counts = agency_counts.sort_values(ascending=False)
agency_counts.plot.bar(figsize=(8, 5.5))
plt.savefig("3.png", dpi=100)
plt.clf()
In [51]:
# One horizontal bar chart of per-agency counts for each borough, laid out on a
# ROW_NUM x COL_NUM grid and saved as 4.png. The grid holds at most
# ROW_NUM * COL_NUM panels.
COL_NUM = 2
ROW_NUM = 3
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12, 12))
# After unstack() the rows are agencies and each column is one borough.
borough_counts = data.groupby(['Agency', 'Borough']).size().unstack()
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for i, (label, col) in enumerate(borough_counts.items()):
    # Fill the grid left to right, top to bottom.
    grid_row, grid_col = divmod(i, COL_NUM)
    ax = axes[grid_row, grid_col]
    col = col.sort_values(ascending=False)
    ax.set_title(label)
    col.plot(kind='barh', ax=ax)
plt.tight_layout()
plt.savefig("4.png", dpi=100)
plt.clf()
In [ ]: